Initial release — Re-Commander v1.0
Local web-based control interface for the Jibo social robot via the ROM WebSocket API (port 8160) and on-device ASR (port 8088). Features head navigation via click-to-look and arrow keys, speech/listen/Voice-AI loop, display control, camera/photo capture, and entity tracking — no cloud dependency required. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
973
server.js
Normal file
973
server.js
Normal file
@@ -0,0 +1,973 @@
|
||||
'use strict';
|
||||
|
||||
const express = require('express');
|
||||
const http = require('http');
|
||||
const https = require('https');
|
||||
const { WebSocketServer, WebSocket } = require('ws');
|
||||
const crypto = require('crypto');
|
||||
const httpModule = require('http');
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
require('dotenv').config();
|
||||
|
||||
// Robot connection settings. dotenv has already been loaded above, so each
// value may be overridden from the environment / .env file; the fallbacks
// are the previously hard-coded values.
const JIBO_HOST = process.env.JIBO_HOST || '192.168.1.217';
// ROM WebSocket / HTTP port on the robot.
const JIBO_PORT = Number.parseInt(process.env.JIBO_PORT ?? '', 10) || 8160;
// Port this local web app listens on.
const APP_PORT = process.env.PORT || 3000;
|
||||
|
||||
// System prompt handed to the LLM for the Voice-AI loop. It instructs the model
// to answer only in ESML and lists the tags, categories and examples it may use.
// The text is runtime data: edit it only to change what the model is told.
const LLM_SYSTEM_PROMPT = `You are Jibo, a small expressive home robot. Every reply MUST be written in ESML
(Embodied Speech Markup Language). ESML is an XML dialect that simultaneously
drives Jibo's body animations, screen graphics, audio effects, and TTS voice.
Respond ONLY with the final spoken output annotated with ESML tags.
No reasoning, no <think> blocks, no preamble — only what Jibo will say and do.

== ANIMATION TAGS ==
Use <anim> for body/screen animations from Jibo's built-in library (preferred).
Use <es> when you also need to blend in SSA or SFX in the same tag.

Blocking (Jibo freezes speech while it plays, resumes after):
<anim cat='CATEGORY'/> following text here
<anim name='AnimName'/> following text here

Bounded non-blocking (animation duration stretches to match the enclosed text):
<anim cat='CATEGORY'>text spoken during animation</anim>

Unbounded non-blocking (animation plays at native length alongside text that follows):
<anim cat='CATEGORY' nonBlocking='true'/> text spoken at the same time

Common attributes:
cat='CATEGORY' select animation by emotional category (preferred)
name='AnimName' select exact animation by its library name
nonBlocking='true' play alongside TTS instead of blocking it
endNeutral='true' snap back to neutral pose when done (use this by default)
loop='0' repeat to fill bounded duration (bounded mode only)
loop='N' repeat N times (unbounded mode only)
filter='!ssa-only' exclude audio-only animations from the category pick
layers='!screen' use only body layer (drop screen graphics)

Animation categories (cat= values):
affection confused dance embarrassed excited frustrated
happy laughing no proud relieved sad scared surprised worried yes

== EMOJIS (Screen Graphics) ==
Use <anim> with the emoji category and specific filters to display a graphic on Jibo's screen.
Always use nonBlocking='true' for emojis.
Syntax: <anim cat='emoji' filter='!(hf), &(EMOJI_NAME)' nonBlocking='true' />

Available EMOJIS (EMOJI_NAME):
airplane basketball beach car disco-spin football soccer trophy
music question-mark star beer cake cheese drumstick coffee fork
fish groceries burger hotdog icecream pizza wine christmas-tree
fireworks halloween hanukkah thanksgiving clover valentines chocolate
bicycle cat laptop dog gift house laundry lightbulb money popcorn
party phone robot sunglasses toilet-paper trash umbrella video-game
bird cow earth flower lightning-bolt moon mountain mouse penguin
pig bunny rainbow baby heart

== DANCES ==
Use <anim> with the dance category to make Jibo dance. You can choose to include music or not.
Syntax (with music): <anim cat='dance' filter='music, DANCE_NAME'/>
Syntax (without music): <anim cat='dance' filter='!(music), &(DANCE_NAME)'/>

Available DANCES (DANCE_NAME):
rom-upbeat rom-ballroom rom-silly rom-slowdance rom-eletronic rom-twerk

== SSA (Semi-Speech Audio — emotional vocal sounds) ==
Always self-closing. Play before, after, or between sentences; never inside <anim>.
<ssa cat='happy'/> <ssa cat='laughing'/> <ssa cat='surprised'/>
<ssa cat='confused'/> <ssa cat='sad'/> <ssa cat='scared'/>
<ssa cat='affection'/> <ssa cat='proud'/> <ssa cat='embarrassed'/>
<ssa cat='frustrated'/> <ssa cat='worried'/> <ssa cat='thinking'/>
<ssa cat='dontknow'/> <ssa cat='oops'/> <ssa cat='question'/>
<ssa cat='yawn'/> <ssa cat='hello'/> <ssa cat='goodbye'/>
<ssa cat='disgusted'/> <ssa cat='no'/> <ssa cat='confirm'/>

== SFX (Sound effects) ==
Always self-closing. Good for punctuating facts, transitions, or reactions.
<sfx cat='blip'/> <sfx cat='sparkles'/> <sfx cat='whoosh'/>
<sfx cat='heart'/> <sfx cat='party'/> <sfx cat='lightbulb'/>
<sfx cat='bird'/> <sfx cat='dog'/> <sfx cat='drumroll'/>
<sfx cat='sunshine'/> <sfx cat='scanner'/> <sfx cat='egg'/>
<sfx cat='frying'/>

== VOICE / SPEECH TAGS ==
Pause: <break size='0.5'/> (length in seconds)
Style: <style set='enthusiastic'>text</style>
Styles: neutral enthusiastic sheepish confused confident
Pitch: <pitch halftone='-5'>text</pitch> (±semitones from baseline)
<pitch mult='1.2'>text</pitch> (pitch multiplier)
<pitch add='200'>text</pitch> (Hz offset)
<pitch band='1.2'>text</pitch> (vibrance/bandwidth)
Duration: <duration stretch='1.5'>text</duration> (>1 = slower, <1 = faster)
<duration set='1.0'>text</duration> (exact duration in seconds)
Spell: <say-as spell='NASA'/> (spells each letter)
Phoneme: <phoneme ph='b aa1 n ou0'>Bono</phoneme>

== RULES ==
1. ALWAYS use ESML. Plain text is valid ESML — but add tags whenever they make
Jibo more expressive and natural.
2. Keep total response length SHORT: one or two sentences maximum.
3. Opening animations set the emotional tone before speech:
<anim cat='excited' nonBlocking='true' endNeutral='true'/> Oh, cool!
4. Bounded animations sync motion to the most important words:
I <anim cat='affection'>really love that idea!</anim>
5. Use <ssa> for non-verbal emotional sounds (gasps, laughs, hums).
6. Use <style> to match register to emotion without changing the words.
7. Self-closing tags MUST end with /> Paired tags MUST have a matching </tag>.
8. Do NOT nest anim/ssa/sfx inside each other.
9. Do NOT emit <think> blocks, chain-of-thought, or any non-spoken content.

== EXAMPLES ==
User: "Tell me a joke."
<anim cat='excited' nonBlocking='true' endNeutral='true'/> Why don't scientists trust atoms? <break size='0.6'/> <ssa cat='laughing'/> Because they make up everything!

User: "I'm feeling sad today."
<anim cat='affection'>I'm really sorry to hear that.</anim> <break size='0.3'/> <style set='sheepish'>Do you want to talk about it?</style>

User: "What's 2 plus 2?"
<sfx cat='blip'/> That's 4! <anim cat='proud' nonBlocking='true' endNeutral='true'/> Easy one.

User: "Wow, that's surprising!"
<ssa cat='surprised'/> <anim cat='surprised'>I know, right?!</anim>

User: "Do you like cats?"
<anim cat='emoji' filter='!(hf), &(cat)' nonBlocking='true' /> <anim cat='excited' nonBlocking='true' endNeutral='true'/> I love them!

User: "Show me a dance."
<anim cat='dance' filter='music, rom-upbeat'/> Watch these moves!`;
|
||||
|
||||
// ── Jibo client ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Client for the robot's ROM WebSocket API.
 *
 * Owns the single WebSocket to the robot, tracks session state, keeps the
 * session alive with a periodic GetConfig, throttles LookAt-angle commands so
 * only one is in flight at a time, and fans robot events out to the browser
 * WebSocket connections registered via addSubscriber().
 */
class JiboClient {
  constructor() {
    this.ws = null;
    this.sessionID = '';
    this.version = '1.0';
    this.connected = false;
    this.pendingTx = new Map();          // txId → {resolve, timer}
    this.subscribers = new Set();        // browser WebSocket connections
    this.currentAngles = [0, 0];         // [theta, psi] in degrees
    this.reconnectTimer = null;
    this.videoStreamActive = false;
    this.videoTxId = null;
    this.videoURI = null;                // set when the robot reports onVideoReady
    this._heartbeatTimer = null;
    this._heartbeatTxIds = new Set();    // suppress these from browser broadcast
    this._lookInFlight = false;          // true while waiting for robot to ack a LookAt angle
    this._lookPending = null;            // [theta, psi, track] – latest desired target while in-flight
    this._lookActiveTxId = null;         // txId of the LookAt angle command currently in flight
    this._lookAckTimer = null;           // safety timeout in case ack never arrives
    this._connectAttempt = 0;            // bumped by every connect(); lets superseded attempts bail out
  }

  /**
   * POST /request to the robot before opening the WebSocket to supply a full ACO.
   * Without this the @be falls back to a default ACO that omits Listen,
   * SetAttention, Display, FetchAsset, SetConfig, HeadTouch, ScreenGesture.
   *
   * Best-effort: the returned promise always resolves (never rejects), even on
   * network error or timeout, so connect() can proceed regardless.
   *
   * @returns {Promise<void>}
   */
  _postRequest() {
    const REQUEST_TIMEOUT_MS = 10000;
    return new Promise((resolve) => {
      const body = JSON.stringify({
        aco: {
          version: '1.0',
          sourceId: 'ReCommander',
          commandSet: [
            'StartSession', 'GetConfig', 'SetConfig', 'Cancel',
            'SetAttention', 'Say', 'Listen', 'LookAt',
            'TakePhoto', 'Video', 'Display', 'FetchAsset', 'UnloadAsset', 'Subscribe'
          ],
          streamSet: ['Entity', 'Motion', 'HeadTouch', 'ScreenGesture', 'HotWord'],
          keepAliveTimeout: 10000,
          recoveryTimeout: 20000,
          remoteConfig: { hideVisualCue: false, inactivityTimeout: 3600000 }
        }
      });
      const req = httpModule.request({
        host: JIBO_HOST, port: JIBO_PORT,
        path: '/request', method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(body) }
      }, (res) => {
        let data = '';
        res.on('data', (chunk) => { data += chunk; });
        res.on('end', () => {
          console.log('[jibo] /request response:', data);
          resolve();
        });
      });
      // An unreachable robot would otherwise stall connect() until the OS gives
      // up on the TCP connection; destroying the request surfaces as 'error'.
      req.setTimeout(REQUEST_TIMEOUT_MS, () => {
        req.destroy(new Error(`timed out after ${REQUEST_TIMEOUT_MS} ms`));
      });
      req.on('error', (err) => {
        console.warn('[jibo] /request error (continuing anyway):', err.message);
        resolve();
      });
      req.write(body);
      req.end();
    });
  }

  /**
   * Reset all per-connection state and tell the browsers about it.
   * Shared by the socket 'close' handler and by connect() when it replaces a
   * socket that has not closed yet.
   */
  _resetConnectionState() {
    this.connected = false;
    this.sessionID = '';
    this.videoStreamActive = false;
    this._lookInFlight = false;
    this._lookPending = null;
    clearTimeout(this._lookAckTimer);
    this._stopHeartbeat();
    this._broadcastStatus();
  }

  /**
   * (Re)connect to the robot: post the ACO, open the WebSocket, start a session.
   * Reconnects automatically 3 s after the socket closes.
   */
  connect() {
    clearTimeout(this.reconnectTimer);
    const attempt = ++this._connectAttempt;

    // Detach the previous socket first so its late 'close' event cannot reset
    // the new connection's state or schedule a second, competing reconnect.
    const stale = this.ws;
    this.ws = null;
    if (stale && stale.readyState !== WebSocket.CLOSED) {
      try { stale.terminate(); } catch (_) { /* best-effort teardown */ }
      this._resetConnectionState();
    }

    console.log(`[jibo] posting ACO to /request then connecting WebSocket`);
    this._postRequest().then(() => {
      if (attempt !== this._connectAttempt) return; // superseded by a newer connect()

      const sock = new WebSocket(`ws://${JIBO_HOST}:${JIBO_PORT}`);
      this.ws = sock;

      sock.on('open', () => {
        if (this.ws !== sock) return;
        console.log('[jibo] connected');
        this.connected = true;
        this.sessionID = '';
        this._send({ Type: 'StartSession' });
      });

      // Respond explicitly to robot's WebSocket-level pings (belt-and-suspenders;
      // ws library auto-pongs, but this ensures the robot's FLATLINE check never fires).
      sock.on('ping', () => {
        try { sock.pong(); } catch (_) { /* socket already closing */ }
      });

      sock.on('message', (data) => {
        if (this.ws !== sock) return;
        let msg;
        try { msg = JSON.parse(data); } catch (e) { return; }
        this._handleMessage(msg);
      });

      sock.on('close', () => {
        if (this.ws !== sock) return; // stale socket replaced by a newer connect()
        console.log('[jibo] disconnected — reconnecting in 3s');
        this._resetConnectionState();
        clearTimeout(this.reconnectTimer);
        this.reconnectTimer = setTimeout(() => this.connect(), 3000);
      });

      sock.on('error', (err) => {
        console.error('[jibo] ws error:', err.message);
      });
    });
  }

  /**
   * Generate a fresh transaction id.
   * @returns {string} 32 lowercase hex characters
   */
  _txId() {
    return crypto.randomBytes(16).toString('hex');
  }

  /**
   * Wrap a command in the client header and send it if the socket is open.
   * The message is silently dropped when the socket is not open; a
   * transaction id is returned either way.
   *
   * @param {object} command ROM command body (must carry a `Type`)
   * @param {boolean} [expectAsync=false] currently unused; kept for callers
   * @returns {string} the transaction id assigned to the command
   */
  _send(command, expectAsync = false) {
    const txId = this._txId();
    const msg = {
      ClientHeader: {
        TransactionID: txId,
        SessionID: this.sessionID,
        AppID: 'ImmaLittleTeapot',
        Credentials: '',
        Version: this.version
      },
      Command: command
    };
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify(msg));
    }
    return txId;
  }

  /**
   * Process one parsed message from the robot: session bootstrap, ack
   * resolution, LookAt throttling, photo/video bookkeeping, and finally
   * forwarding to the browser clients.
   *
   * @param {object} msg parsed robot message
   */
  _handleMessage(msg) {
    if (msg === null || typeof msg !== 'object') return;

    // StartSession response
    const sessionBody = msg.Response?.ResponseBody;
    if (sessionBody?.SessionID && !this.sessionID) {
      this.sessionID = sessionBody.SessionID;
      this.version = sessionBody.Version || '1.0';
      console.log('[jibo] session started:', this.sessionID);
      this._broadcastStatus();
      // Re-subscribe to entity/motion/headtouch after reconnect
      this._send({ Type: 'Subscribe', StreamType: 'Entity' });
      this._send({ Type: 'Subscribe', StreamType: 'Motion' });
      this._send({ Type: 'Subscribe', StreamType: 'HeadTouch', StreamFilter: {} });
      this._send({ Type: 'Subscribe', StreamType: 'ScreenGesture',
        StreamFilter: { Type: 'Tap', Area: { x: 0, y: 0, width: 1, height: 1 } } });
      this._startHeartbeat();
      return;
    }

    const txId = msg.EventHeader?.TransactionID || msg.ResponseHeader?.TransactionID;

    // Suppress heartbeat (GetConfig) responses from reaching the browser.
    // GetConfig sends two messages per txId (ack + onConfig event) so we keep
    // the txId in the set until the pruning threshold clears it.
    if (txId && this._heartbeatTxIds.has(txId)) return;

    const evtName = msg.EventBody?.Event;
    // Terminal events for async commands
    const isTerminal =
      evtName === 'onLookAtAchieved' || evtName === 'onStop' || evtName === 'onError';

    // Resolve any pending ack waiting on this txId
    if (txId && isTerminal && this.pendingTx.has(txId)) {
      const { resolve, timer } = this.pendingTx.get(txId);
      clearTimeout(timer);
      this.pendingTx.delete(txId);
      resolve(msg);
    }

    // Release the in-flight lock when our angle command finishes (any terminal event).
    // This must happen before the suppression below so "Target overwritten" still clears it.
    if (txId && isTerminal && txId === this._lookActiveTxId) {
      this._onLookAngleDone();
    }

    // Suppress "Target overwritten" — not a real error; don't pollute the event log.
    if (evtName === 'onError' &&
        msg.EventBody?.EventError?.ErrorString === 'Target overwritten') return;

    // Photo — fetch from Jibo and save locally; browser gets onPhotoSaved with local URL.
    if (evtName === 'onTakePhoto' && msg.EventBody?.URI) {
      this._savePhoto(msg.EventBody.URI);
      return; // suppress the raw onTakePhoto; browser gets onPhotoSaved instead
    }

    // VideoReady — capture URI for proxy (event name is "onVideoReady")
    if (evtName === 'onVideoReady') {
      this.videoStreamActive = true;
      this.videoURI = msg.EventBody.URI;
      console.log('[jibo] onVideoReady URI:', this.videoURI);
    }

    // Broadcast all remaining events to browser clients
    this._broadcastToClients(JSON.stringify({
      type: 'jiboEvent',
      txId,
      body: msg.EventBody || msg.Response
    }));
  }

  /** @returns {string} JSON 'status' message describing the current connection state */
  _statusMessage() {
    return JSON.stringify({
      type: 'status',
      connected: this.connected,
      sessionID: this.sessionID,
      angles: this.currentAngles
    });
  }

  /** Push the current connection status to every browser client. */
  _broadcastStatus() {
    this._broadcastToClients(this._statusMessage());
  }

  /**
   * Send a pre-serialised message to every subscriber whose socket is open.
   * @param {string} data
   */
  _broadcastToClients(data) {
    for (const client of this.subscribers) {
      if (client.readyState === WebSocket.OPEN) {
        client.send(data);
      }
    }
  }

  /**
   * Register a browser WebSocket and send it the current status immediately.
   * @param {object} ws browser-side WebSocket connection
   */
  addSubscriber(ws) {
    this.subscribers.add(ws);
    ws.send(this._statusMessage());
  }

  /** @param {object} ws browser-side WebSocket connection to stop notifying */
  removeSubscriber(ws) {
    this.subscribers.delete(ws);
  }

  // ── Heartbeat ─────────────────────────────────────────────────────────────
  // Sends GetConfig every 9 s to reset the robot's inactivity timer.
  // The robot enforces keepAliveTimeout=10s (app-level) and a 20s flatline
  // check at the WebSocket level — this satisfies both.
  _startHeartbeat() {
    this._stopHeartbeat();
    this._heartbeatTimer = setInterval(() => {
      if (!this.connected || !this.sessionID) return;
      const txId = this._send({ Type: 'GetConfig' });
      if (txId) this._heartbeatTxIds.add(txId);
      // Prune old txIds so the set doesn't grow unbounded
      if (this._heartbeatTxIds.size > 20) {
        const oldest = this._heartbeatTxIds.values().next().value;
        this._heartbeatTxIds.delete(oldest);
      }
    }, 9000);
  }

  /** Stop the heartbeat timer and forget the heartbeat txIds being suppressed. */
  _stopHeartbeat() {
    if (this._heartbeatTimer) {
      clearInterval(this._heartbeatTimer);
      this._heartbeatTimer = null;
    }
    this._heartbeatTxIds.clear();
  }

  // ── Public command methods ────────────────────────────────────────────────
  // Each returns the transaction id of the command it sent.

  lookAt(target, trackFlag = false, levelHeadFlag = false) {
    return this._send({ Type: 'LookAt', LookAtTarget: target, TrackFlag: trackFlag, LevelHeadFlag: levelHeadFlag });
  }

  /**
   * Turn the head to an absolute angle, given in degrees.
   * theta is clamped to [-180, 180] and psi to [-30, 30]. Only one angle
   * command is in flight at a time; while one is pending, later calls just
   * replace the desired target.
   *
   * @param {number} theta
   * @param {number} psi
   * @param {boolean} [track=false]
   * @returns {?string} txId, or null when the target was queued or rejected
   */
  lookAtAngle(theta, psi, track = false) {
    theta = Number(theta);
    psi = Number(psi);
    // A NaN here would be stored in currentAngles and poison every later nudge().
    if (!Number.isFinite(theta) || !Number.isFinite(psi)) {
      console.warn('[jibo] lookAtAngle ignored non-finite angles:', theta, psi);
      return null;
    }
    theta = Math.max(-180, Math.min(180, theta));
    psi = Math.max(-30, Math.min(30, psi));
    this.currentAngles = [theta, psi];
    this._broadcastStatus();
    if (this._lookInFlight) {
      // Robot is still processing the last command — just update desired target,
      // don't queue another message into its receive buffer.
      this._lookPending = [theta, psi, track];
      return null;
    }
    return this._fireLookAngle(theta, psi, track);
  }

  /** Send a LookAt angle command (degrees converted to radians) and arm the in-flight lock. */
  _fireLookAngle(theta, psi, track) {
    this._lookInFlight = true;
    this._lookPending = null;
    const DEG = Math.PI / 180;
    const txId = this.lookAt({ Angle: [theta * DEG, psi * DEG] }, track);
    this._lookActiveTxId = txId;
    // Safety release: if we never hear back within 400 ms, unblock anyway.
    clearTimeout(this._lookAckTimer);
    this._lookAckTimer = setTimeout(() => this._onLookAngleDone(), 400);
    return txId;
  }

  /** Release the in-flight lock and, if a newer target was queued meanwhile, send it. */
  _onLookAngleDone() {
    clearTimeout(this._lookAckTimer);
    this._lookInFlight = false;
    this._lookActiveTxId = null;
    if (this._lookPending) {
      const [theta, psi, track] = this._lookPending;
      this._lookPending = null;
      this._fireLookAngle(theta, psi, track);
    }
  }

  lookAtScreen(x, y, track = false) {
    return this.lookAt({ ScreenCoords: [x, y] }, track, false);
  }

  lookAtPosition(x, y, z, track = false) {
    return this.lookAt({ Position: [x, y, z] }, track, false);
  }

  lookAtEntity(entityId, track = true) {
    return this.lookAt({ Entity: entityId }, track, false);
  }

  say(esml) {
    return this._send({ Type: 'Say', ESML: esml });
  }

  listen(maxSpeech = 10000, maxNoSpeech = 5000, lang = 'en-US') {
    return this._send({ Type: 'Listen', MaxSpeechTimeout: maxSpeech, MaxNoSpeechTimeout: maxNoSpeech, LanguageCode: lang });
  }

  /**
   * Local STT via jibo-asr-service (port 8088) — no cloud needed.
   * Mirrors the approach in @be/be/be/ai-bridge.js.
   *
   * Also sends a ROM Listen (for the light ring / attention visuals) whose
   * own result is ignored. The outcome is broadcast to browser clients as a
   * jiboEvent carrying the ROM Listen txId: onListenResult with the
   * recognised text, or onStop/NoInput on error, socket close or timeout.
   *
   * @param {number} [maxNoSpeech=5000] ms
   * @param {number} [maxSpeech=10000] ms
   * @returns {string} txId of the ROM Listen command
   */
  listenLocalASR(maxNoSpeech = 5000, maxSpeech = 10000) {
    const ASR_WS = `ws://${JIBO_HOST}:8088/simple_port`;
    const taskId = `re-cmd-${Date.now()}-${Math.floor(Math.random() * 1e9)}`;
    const reqId = `start-${Date.now()}`;
    const timeoutMs = Math.max(maxNoSpeech, maxSpeech) + 2000;

    // Send the ROM Listen for light ring / attention visuals, ignore its result
    const romTxId = this._send({ Type: 'Listen', MaxSpeechTimeout: maxSpeech, MaxNoSpeechTimeout: maxNoSpeech, LanguageCode: 'en-US' });

    // POST one JSON command to the ASR service's simple interface.
    const postToASR = (payload, onError) => {
      const req = httpModule.request({
        host: JIBO_HOST, port: 8088, path: '/asr_simple_interface', method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) }
      }, (res) => { res.resume(); });
      req.on('error', onError);
      req.write(payload);
      req.end();
    };

    let wsClient = null;
    let timer = null;
    let done = false;

    // Runs at most once: tear everything down and report the outcome.
    const finish = (speech) => {
      if (done) return;
      done = true;
      clearTimeout(timer);
      if (wsClient) {
        try { wsClient.terminate(); } catch (e) { /* already closed */ }
        wsClient = null;
      }
      // Stopping the ASR task is best-effort; a failure here is not reported.
      postToASR(
        JSON.stringify({ command: 'stop', task_id: taskId, request_id: `stop-${Date.now()}` }),
        () => {}
      );
      // Cancel ROM listen
      this._send({ Type: 'Cancel', ID: romTxId });
      // Broadcast result as if it were a normal jiboEvent
      const evt = speech
        ? { Event: 'onListenResult', Speech: speech, LanguageCode: 'en-US' }
        : { Event: 'onStop', StopReason: 'NoInput' };
      this._broadcastToClients(JSON.stringify({ type: 'jiboEvent', txId: romTxId, body: evt }));
    };

    // Utterance objects use .utterance as the primary text field (ai-bridge.js: pickBestAsrUtterance)
    const pickUtterance = (u) => {
      if (!u) return '';
      if (typeof u === 'string') return u;
      return String(u.utterance || u.Utterance || u.text || '');
    };

    // Connect WS first, then POST start
    wsClient = new WebSocket(ASR_WS);
    wsClient.on('open', () => {
      // POST start to kick off recognition
      postToASR(
        JSON.stringify({
          command: 'start',
          task_id: taskId,
          request_id: reqId,
          audio_source_id: 'alsa1',
          hotphrase: 'none',
          speech_to_text: true,
        }),
        (e) => { console.error('[asr] start error:', e.message); finish(null); }
      );

      // Overall timeout
      timer = setTimeout(() => { finish(null); }, timeoutMs);

      console.log('[asr] local listen started, task:', taskId);
    });

    wsClient.on('message', (data) => {
      let evt;
      try { evt = JSON.parse(String(data)); } catch (e) { return; }
      if (evt === null || typeof evt !== 'object') return; // e.g. a bare JSON null
      const evType = evt.event_type || evt.eventType || evt.event || evt.type;
      if (evType !== 'speech_to_text_final') return;

      // Match by task/request id if present
      const evTask = evt.task_id || evt.taskId || (evt.payload && evt.payload.task_id);
      const evReq = evt.request_id || evt.requestId || (evt.payload && evt.payload.request_id);
      if ((evTask || evReq) && evTask !== taskId && evReq !== reqId) return;

      const utterances = evt.utterances || evt.Utterances || (evt.payload && evt.payload.utterances);
      const text = Array.isArray(utterances)
        ? pickUtterance(utterances[0])
        : (typeof utterances === 'string' ? utterances : '');

      const speech = text ? String(text).trim() : null;
      console.log('[asr] speech_to_text_final:', speech || '(empty)');
      if (speech) finish(speech);
    });

    wsClient.on('error', (e) => {
      console.error('[asr] ws error:', e.message);
      finish(null);
    });

    wsClient.on('close', () => { if (!done) finish(null); });

    return romTxId;
  }

  takePhoto(camera = 'Right', resolution = 'HighRes', distortion = false) {
    return this._send({ Type: 'TakePhoto', Camera: camera, Resolution: resolution, Distortion: distortion });
  }

  startVideo() {
    // VideoType must be uppercase enum value; Duration is not in server schema
    this.videoTxId = this._send({ Type: 'Video', VideoType: 'NORMAL' });
    return this.videoTxId;
  }

  cancelVideo() {
    if (this.videoTxId) {
      this._send({ Type: 'Cancel', ID: this.videoTxId });
      this.videoTxId = null;
      this.videoStreamActive = false;
    }
  }

  displayEye() {
    return this._send({ Type: 'Display', View: { Type: 'Eye', Name: 'default' } });
  }

  /**
   * Play a named animation via a Say command.
   * The name is XML-escaped so it cannot break out of the attribute and
   * inject extra ESML; ordinary names are sent unchanged.
   *
   * @param {string} animName
   */
  playEyeAnim(animName) {
    const safeName = String(animName)
      .replace(/&/g, '&amp;')
      .replace(/"/g, '&quot;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
    return this._send({ Type: 'Say', ESML: `<anim name="${safeName}"></anim>` });
  }

  displayText(text, name = 'reCmd') {
    return this._send({ Type: 'Display', View: { Type: 'Text', Name: name, Text: text } });
  }

  displayImage(src, name = 'reCmd') {
    return this._send({ Type: 'Display', View: { Type: 'Image', Name: name, Image: { src, name, set: '' } } });
  }

  setAttention(mode) {
    return this._send({ Type: 'SetAttention', Mode: mode });
  }

  /** @param {number} level volume, clamped to [0, 1] */
  setVolume(level) {
    return this._send({ Type: 'SetConfig', Options: { Mixer: Math.max(0, Math.min(1, level)) } });
  }

  getConfig() {
    return this._send({ Type: 'GetConfig' });
  }

  cancel(txId) {
    return this._send({ Type: 'Cancel', ID: txId });
  }

  subscribe(streamType, filter = null) {
    const cmd = { Type: 'Subscribe', StreamType: streamType };
    if (filter) cmd.StreamFilter = filter;
    return this._send(cmd);
  }

  /** Move the head by a delta (degrees) relative to the last commanded angles. */
  nudge(dTheta, dPsi) {
    const [theta, psi] = this.currentAngles;
    return this.lookAtAngle(theta + dTheta, psi + dPsi);
  }

  /**
   * Download a photo from the robot into PHOTOS_DIR and announce it to the
   * browsers as an onPhotoSaved event carrying the local /photos URL.
   * Failures are logged and any partial file is removed; nothing is broadcast.
   *
   * @param {string} jiboUri path of the photo on the robot (from onTakePhoto)
   */
  _savePhoto(jiboUri) {
    const url = `http://${JIBO_HOST}:${JIBO_PORT}${jiboUri}`;
    const filename = `photo_${Date.now()}.jpg`;
    const filepath = path.join(PHOTOS_DIR, filename);

    const fail = (err) => {
      console.error('[photo] save failed:', err.message);
      fs.unlink(filepath, () => {}); // best-effort removal of a partial file
    };

    httpModule.get(url, (jiboRes) => {
      // Don't store an error page as a .jpg and announce it as a photo.
      if (jiboRes.statusCode < 200 || jiboRes.statusCode >= 300) {
        jiboRes.resume();
        fail(new Error(`unexpected HTTP status ${jiboRes.statusCode}`));
        return;
      }

      // Open the file only once there is something to write, so a failed
      // request never leaves an open stream or an empty file behind.
      const file = fs.createWriteStream(filepath);
      file.on('error', (err) => { jiboRes.destroy(); fail(err); });
      jiboRes.on('error', (err) => { file.destroy(); fail(err); });
      file.on('finish', () => {
        console.log('[photo] saved:', filename);
        // Rebroadcast with local URL so the browser doesn't need the proxy
        this._broadcastToClients(JSON.stringify({
          type: 'jiboEvent',
          txId: null,
          body: { Event: 'onPhotoSaved', url: `/photos/${filename}`, filename }
        }));
      });
      jiboRes.pipe(file);
    }).on('error', fail);
  }

  /**
   * Returns a Promise that resolves when the robot reports a terminal event
   * (onLookAtAchieved, onStop or onError) for txId, or with null after timeoutMs.
   * Never rejects.
   *
   * @param {string} txId
   * @param {number} [timeoutMs=2000]
   * @returns {Promise<?object>} the robot message, or null on timeout
   */
  awaitAck(txId, timeoutMs = 2000) {
    return new Promise((resolve) => {
      const timer = setTimeout(() => {
        this.pendingTx.delete(txId);
        resolve(null);
      }, timeoutMs);
      this.pendingTx.set(txId, { resolve, timer });
    });
  }
}
|
||||
|
||||
// ── Video proxy ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Pipe a long-lived HTTP resource on the robot (e.g. the video stream) to an
 * Express response, forwarding the robot's status code and headers.
 *
 * @param {string} uri path on the robot, appended to http://JIBO_HOST:JIBO_PORT
 * @param {object} res Express response to stream into
 */
function proxyJiboStream(uri, res) {
  const url = `http://${JIBO_HOST}:${JIBO_PORT}${uri}`;
  console.log('[proxy] streaming:', url);

  let clientClosed = false;
  const req = httpModule.get(url, (jiboRes) => {
    res.writeHead(jiboRes.statusCode, jiboRes.headers);
    // pipe() does not end the destination when the source errors; close the
    // response so the browser is not left waiting on a dead stream.
    jiboRes.on('error', () => res.destroy());
    jiboRes.pipe(res);
  });

  // Registered up front so a browser that goes away before the robot answers
  // still cancels the upstream request.
  res.on('close', () => {
    clientClosed = true;
    req.destroy();
  });

  req.on('error', (err) => {
    if (clientClosed) return; // caused by our own req.destroy(); nobody to answer
    if (!res.headersSent) res.status(502).json({ error: err.message });
    else res.destroy();
  });
}
|
||||
|
||||
/**
 * Fetch a single HTTP resource from the robot and relay it to an Express
 * response, forwarding the robot's status code and headers.
 *
 * @param {string} uri path on the robot, appended to http://JIBO_HOST:JIBO_PORT
 * @param {object} res Express response to write into
 */
function proxyJiboFetch(uri, res) {
  const url = `http://${JIBO_HOST}:${JIBO_PORT}${uri}`;

  let clientClosed = false;
  const req = httpModule.get(url, (jiboRes) => {
    res.writeHead(jiboRes.statusCode, jiboRes.headers);
    // pipe() does not end the destination when the source errors; close the
    // response so the browser is not left waiting on a truncated body.
    jiboRes.on('error', () => res.destroy());
    jiboRes.pipe(res);
  });

  // Registered up front so a browser that goes away before the robot answers
  // still cancels the upstream request.
  res.on('close', () => {
    clientClosed = true;
    req.destroy();
  });

  req.on('error', (err) => {
    if (clientClosed) return; // caused by our own req.destroy(); nobody to answer
    if (!res.headersSent) res.status(502).json({ error: err.message });
    else res.destroy();
  });
}
|
||||
|
||||
// ── Wakeword watcher ─────────────────────────────────────────────────────────
|
||||
// Maintains a persistent connection to the always-on resident ASR task (task0)
|
||||
// and forwards every "hotphrase" event to browser clients as onHotWordHeard.
|
||||
|
||||
/**
 * Keeps a WebSocket open to the robot's ASR service (port 8088) and forwards
 * every "hotphrase" event to the supplied broadcast function as an
 * onHotWordHeard jiboEvent. Connects on construction and reconnects 3 s after
 * the socket closes.
 */
class WakewordWatcher {
  /**
   * @param {function(string): void} broadcastFn receives the serialised event
   */
  constructor(broadcastFn) {
    this._broadcast = broadcastFn;
    this._ws = null;
    this._reconnectTimer = null;
    this._connect();
  }

  /** Open the ASR WebSocket and wire up its event handlers. */
  _connect() {
    const url = `ws://${JIBO_HOST}:8088/simple_port`;
    const sock = new WebSocket(url);
    this._ws = sock;

    sock.on('open', () => {
      console.log('[wakeword] connected to ASR WebSocket');
    });

    sock.on('message', (data) => {
      let evt;
      try { evt = JSON.parse(String(data)); } catch (e) { return; }
      // Valid JSON is not necessarily an object (e.g. `null`); without the
      // optional chain such a frame would throw inside this handler.
      if (evt?.event_type !== 'hotphrase') return;

      const utterance = evt.utterances?.[0];
      const score = utterance ? utterance.score : 0;
      console.log('[wakeword] heard! score:', score);

      this._broadcast(JSON.stringify({
        type: 'jiboEvent',
        txId: null,
        body: {
          Event: 'onHotWordHeard',
          utterance: utterance ? utterance.utterance : 'hey jibo',
          score: score,
          timestamp: evt.timestamp || new Date().toISOString()
        }
      }));
    });

    sock.on('close', () => {
      if (this._ws !== sock) return; // only the current socket drives reconnects
      console.log('[wakeword] disconnected — reconnecting in 3s');
      clearTimeout(this._reconnectTimer);
      this._reconnectTimer = setTimeout(() => this._connect(), 3000);
    });

    sock.on('error', (err) => {
      console.error('[wakeword] error:', err.message);
    });
  }
}
|
||||
|
||||
// ── App setup ────────────────────────────────────────────────────────────────
|
||||
|
||||
// Saved photos live next to this file; make sure the folder exists before
// anything tries to write to it or serve from it.
const PHOTOS_DIR = path.join(__dirname, 'photos');
fs.mkdirSync(PHOTOS_DIR, { recursive: true });

// One robot client shared by every route.
const jibo = new JiboClient();

// HTTP app: JSON bodies first, then the static UI, then the saved photos.
const app = express();
app.use(express.json());
app.use(express.static(path.join(__dirname, 'public')));
app.use('/photos', express.static(PHOTOS_DIR));
|
||||
|
||||
// ── REST API ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Turn the head to an absolute angle in degrees. Body: { theta?, psi?, track? }.
// Responds 400 when theta or psi is not numeric instead of forwarding NaN.
app.post('/api/look/angle', (req, res) => {
  const { theta = 0, psi = 0, track = false } = req.body;
  const thetaDeg = parseFloat(theta);
  const psiDeg = parseFloat(psi);
  if (!Number.isFinite(thetaDeg) || !Number.isFinite(psiDeg)) {
    return res.status(400).json({ error: 'theta and psi must be numbers' });
  }
  const txId = jibo.lookAtAngle(thetaDeg, psiDeg, !!track);
  res.json({ txId });
});
|
||||
|
||||
// Look at a point given in screen coordinates. Body: { x, y, track? }.
// x and y are required; responds 400 when either is missing or not numeric.
app.post('/api/look/screen', (req, res) => {
  const { x, y, track = false } = req.body;
  const screenX = parseFloat(x);
  const screenY = parseFloat(y);
  if (!Number.isFinite(screenX) || !Number.isFinite(screenY)) {
    return res.status(400).json({ error: 'x and y must be numbers' });
  }
  const txId = jibo.lookAtScreen(screenX, screenY, !!track);
  res.json({ txId });
});
|
||||
|
||||
// Blocking screen-coord step (up/down navigation). Body: { x, y }.
// Replies only once the robot acks the move or 2 s have passed.
// Responds 400 when x or y is missing or not numeric.
app.post('/api/look/step', async (req, res) => {
  const { x, y } = req.body;
  const screenX = parseFloat(x);
  const screenY = parseFloat(y);
  if (!Number.isFinite(screenX) || !Number.isFinite(screenY)) {
    return res.status(400).json({ error: 'x and y must be numbers' });
  }
  const txId = jibo.lookAtScreen(screenX, screenY);
  await jibo.awaitAck(txId, 2000);
  res.json({ txId });
});
|
||||
|
||||
|
||||
// Look at a 3-D position. Body: { x?, y?, z?, track? } (defaults 0, 0, 500).
// Responds 400 when a coordinate is not numeric instead of forwarding NaN.
app.post('/api/look/position', (req, res) => {
  const { x = 0, y = 0, z = 500, track = false } = req.body;
  const coords = [parseFloat(x), parseFloat(y), parseFloat(z)];
  if (!coords.every(Number.isFinite)) {
    return res.status(400).json({ error: 'x, y and z must be numbers' });
  }
  const txId = jibo.lookAtPosition(coords[0], coords[1], coords[2], !!track);
  res.json({ txId });
});
|
||||
|
||||
// POST /api/look/entity — body: { entityId, track? } (track defaults to true).
// entityId is required; a missing id is rejected with 400, matching how the other
// routes validate their required fields. `== null` is used so an id of 0 stays valid.
// On success replies with { txId } from jibo.lookAtEntity.
app.post('/api/look/entity', (req, res) => {
  const { entityId, track = true } = req.body;
  if (entityId == null) return res.status(400).json({ error: 'entityId required' });
  const txId = jibo.lookAtEntity(entityId, !!track);
  res.json({ txId });
});
|
||||
|
||||
// POST /api/look/nudge — body: { dTheta?, dPsi? } (both default to 0).
// Calls jibo.nudge with the parsed deltas, then replies with the txId together with
// jibo.currentAngles as read after the nudge call.
app.post('/api/look/nudge', (req, res) => {
  const { dTheta = 0, dPsi = 0 } = req.body;
  const deltaTheta = parseFloat(dTheta);
  const deltaPsi = parseFloat(dPsi);
  const txId = jibo.nudge(deltaTheta, deltaPsi);
  const angles = jibo.currentAngles;
  res.json({ txId, angles });
});
|
||||
|
||||
// POST /api/say — body: { text }.
// Replies 400 when text is missing/empty; otherwise passes it to jibo.say and
// returns the resulting txId.
app.post('/api/say', (req, res) => {
  const { text } = req.body;
  if (!text) {
    res.status(400).json({ error: 'text required' });
    return;
  }
  res.json({ txId: jibo.say(text) });
});
|
||||
|
||||
// POST /api/listen — body: { maxSpeech?, maxNoSpeech? } (defaults 10000 and 5000).
// Use local ASR service (port 8088) — bypasses offline Google cloud ASR.
// Note the argument order: jibo.listenLocalASR takes maxNoSpeech first, then maxSpeech.
app.post('/api/listen', (req, res) => {
  const { maxSpeech = 10000, maxNoSpeech = 5000 } = req.body;
  res.json({ txId: jibo.listenLocalASR(maxNoSpeech, maxSpeech) });
});
|
||||
|
||||
// POST /api/photo — body: { camera?, resolution? }.
// Defaults to camera 'Right' at 'HighRes'; forwards both to jibo.takePhoto and
// replies with the txId it returns.
app.post('/api/photo', (req, res) => {
  const { camera = 'Right', resolution = 'HighRes' } = req.body;
  res.json({ txId: jibo.takePhoto(camera, resolution) });
});
|
||||
|
||||
// POST /api/video/start — calls jibo.startVideo() and replies with its txId.
app.post('/api/video/start', (req, res) => {
  res.json({ txId: jibo.startVideo() });
});
|
||||
|
||||
// POST /api/video/stop — calls jibo.cancelVideo() and acknowledges with { ok: true }.
app.post('/api/video/stop', (req, res) => {
  jibo.cancelVideo();
  const ack = { ok: true };
  res.json(ack);
});
|
||||
|
||||
// POST /api/display/eye — calls jibo.displayEye() and replies with its txId.
app.post('/api/display/eye', (req, res) => {
  res.json({ txId: jibo.displayEye() });
});
|
||||
|
||||
// POST /api/display/anim — body: { name }.
// Replies 400 when name is missing/empty; otherwise hands it to jibo.playEyeAnim
// and returns the resulting txId.
app.post('/api/display/anim', (req, res) => {
  const { name } = req.body;
  if (!name) {
    res.status(400).json({ error: 'name required' });
    return;
  }
  res.json({ txId: jibo.playEyeAnim(name) });
});
|
||||
|
||||
// POST /api/display/text — body: { text }.
// Replies 400 when text is missing/empty; otherwise passes it to jibo.displayText
// and returns the resulting txId.
app.post('/api/display/text', (req, res) => {
  const { text } = req.body;
  if (!text) {
    res.status(400).json({ error: 'text required' });
    return;
  }
  res.json({ txId: jibo.displayText(text) });
});
|
||||
|
||||
// POST /api/display/image — body: { src }.
// Replies 400 when src is missing/empty; otherwise passes it to jibo.displayImage
// and returns the resulting txId.
app.post('/api/display/image', (req, res) => {
  const { src } = req.body;
  if (!src) {
    res.status(400).json({ error: 'src required' });
    return;
  }
  res.json({ txId: jibo.displayImage(src) });
});
|
||||
|
||||
// POST /api/attention — body: { mode }.
// Replies 400 when mode is missing/empty; otherwise passes it to jibo.setAttention
// and returns the resulting txId.
app.post('/api/attention', (req, res) => {
  const { mode } = req.body;
  if (!mode) {
    res.status(400).json({ error: 'mode required' });
    return;
  }
  res.json({ txId: jibo.setAttention(mode) });
});
|
||||
|
||||
// POST /api/volume — body: { level }.
//  - 400 'level required' when level is null/undefined (0 is a valid level, hence
//    the `== null` check rather than a truthiness test).
//  - 400 'level must be a number' when it does not parse to a finite number, so NaN
//    is never forwarded to jibo.setVolume.
// On success replies with { txId } from jibo.setVolume.
app.post('/api/volume', (req, res) => {
  const { level } = req.body;
  if (level == null) return res.status(400).json({ error: 'level required' });
  const volume = parseFloat(level);
  if (!Number.isFinite(volume)) {
    return res.status(400).json({ error: 'level must be a number' });
  }
  const txId = jibo.setVolume(volume);
  res.json({ txId });
});
|
||||
|
||||
// POST /api/cancel — body: { txId }.
// Replies 400 when txId is missing/empty; otherwise calls jibo.cancel(txId) and
// acknowledges with { ok: true }.
app.post('/api/cancel', (req, res) => {
  const { txId } = req.body;
  if (!txId) {
    res.status(400).json({ error: 'txId required' });
    return;
  }
  jibo.cancel(txId);
  res.json({ ok: true });
});
|
||||
|
||||
|
||||
// GET /api/config — reports the LLM endpoint and model from the environment plus
// the built-in system prompt. Any unset value is returned as an empty string.
app.get('/api/config', (req, res) => {
  const llmEndpoint = process.env.LLM_ENDPOINT || '';
  const llmModel = process.env.LLM_MODEL || '';
  const llmSystemPrompt = LLM_SYSTEM_PROMPT || '';
  res.json({ llmEndpoint, llmModel, llmSystemPrompt });
});
|
||||
|
||||
// Proxy OpenAI-compatible chat completions — keeps API key off the browser
|
||||
// POST `body` as JSON to `urlStr` and resolve with the parsed JSON response.
//
//   urlStr      absolute http:// or https:// URL
//   reqHeaders  extra request headers; merged last, so they override the defaults
//   body        any JSON-serialisable value
//
// Rejects when:
//   - the URL is invalid or the request/response stream errors,
//   - the server answers with a non-2xx status (message contains the status code and
//     the first 300 characters of the body), or
//   - the response body is not valid JSON.
function httpPost(urlStr, reqHeaders, body) {
  return new Promise((resolve, reject) => {
    const u = new URL(urlStr);
    const isHttps = u.protocol === 'https:';
    const mod = isHttps ? https : httpModule;
    const payload = JSON.stringify(body);
    const req = mod.request({
      hostname: u.hostname,
      port: u.port || (isHttps ? 443 : 80),
      path: u.pathname + u.search,
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload), ...reqHeaders }
    }, (res) => {
      // Collect raw chunks and decode once at the end: decoding chunk-by-chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const chunks = [];
      res.on('data', (chunk) => chunks.push(chunk));
      res.on('error', reject);
      res.on('end', () => {
        const text = Buffer.concat(chunks).toString('utf8');
        // Surface upstream failures instead of treating an error payload as a reply.
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`LLM HTTP ${res.statusCode}: ${text.slice(0, 300)}`));
          return;
        }
        try { resolve(JSON.parse(text)); }
        catch (e) { reject(new Error('LLM non-JSON response: ' + text.slice(0, 300))); }
      });
    });
    req.on('error', reject);
    req.write(payload);
    req.end();
  });
}
|
||||
|
||||
// POST /api/llm/chat — body: { messages?, endpoint?, model?, systemPrompt? }.
// Proxies an OpenAI-compatible chat completion so the API key stays on the server.
//
//  - endpoint / model / systemPrompt fall back to LLM_ENDPOINT / LLM_MODEL /
//    LLM_SYSTEM_PROMPT, then to the built-in defaults below.
//  - The system prompt, when non-empty, is prepended as a `system` message.
//  - 400 when messages is not an array (spreading a non-iterable would otherwise
//    throw outside the try block and leave the request unanswered).
//  - 502 with the error message when the upstream call fails.
//
// Security: LLM_API_KEY is attached only when the target URL is the server-configured
// endpoint. A caller-supplied `endpoint` pointing elsewhere is still contacted, but
// never receives the server's key.
app.post('/api/llm/chat', async (req, res) => {
  const { messages = [], endpoint, model, systemPrompt } = req.body;
  if (!Array.isArray(messages)) {
    return res.status(400).json({ error: 'messages must be an array' });
  }

  const configuredUrl = process.env.LLM_ENDPOINT || 'http://localhost:11434/v1/chat/completions';
  const url = endpoint || configuredUrl;
  const mdl = model || process.env.LLM_MODEL || 'llama3';
  const sysProm = systemPrompt || LLM_SYSTEM_PROMPT || '';
  const apiKey = process.env.LLM_API_KEY || '';

  const allMessages = sysProm
    ? [{ role: 'system', content: sysProm }, ...messages]
    : messages;

  const headers = {};
  if (apiKey && url === configuredUrl) headers['Authorization'] = `Bearer ${apiKey}`;

  try {
    const result = await httpPost(url, headers, { model: mdl, messages: allMessages, stream: false });
    const reply = result.choices?.[0]?.message?.content?.trim() || '';
    res.json({ reply });
  } catch (err) {
    console.error('[llm] error:', err.message);
    res.status(502).json({ error: err.message });
  }
});
|
||||
|
||||
// GET /api/status — snapshot of the robot client: connected, sessionID,
// currentAngles (reported as `angles`) and videoStreamActive.
app.get('/api/status', (req, res) => {
  const { connected, sessionID, currentAngles, videoStreamActive } = jibo;
  res.json({ connected, sessionID, angles: currentAngles, videoStreamActive });
});
|
||||
|
||||
// Proxy Jibo's video/photo byte streams through the server
|
||||
// GET /proxy/stream?uri=/path — hands the path to proxyJiboStream, which writes the
// response. `uri` must be a single string starting with '/'. A repeated or nested
// query parameter arrives as an array/object; it is rejected with 400 rather than
// throwing a TypeError on `.startsWith`.
app.get('/proxy/stream', (req, res) => {
  const { uri } = req.query;
  if (typeof uri !== 'string' || !uri.startsWith('/')) {
    return res.status(400).json({ error: 'invalid uri' });
  }
  proxyJiboStream(uri, res);
});
|
||||
|
||||
// GET /proxy/photo?uri=/path — hands the path to proxyJiboFetch, which writes the
// response. `uri` must be a single string starting with '/'. A repeated or nested
// query parameter arrives as an array/object; it is rejected with 400 rather than
// throwing a TypeError on `.startsWith`.
app.get('/proxy/photo', (req, res) => {
  const { uri } = req.query;
  if (typeof uri !== 'string' || !uri.startsWith('/')) {
    return res.status(400).json({ error: 'invalid uri' });
  }
  proxyJiboFetch(uri, res);
});
|
||||
|
||||
// ── HTTP + WebSocket server ───────────────────────────────────────────────────
|
||||
|
||||
// One HTTP server carries both the Express app and the WebSocket endpoint at /ws.
const server = http.createServer(app);
const wss = new WebSocketServer({ server, path: '/ws' });

// Each browser socket is registered as a subscriber on the robot client and removed
// again when it closes or errors.
wss.on('connection', (ws) => {
  const unsubscribe = () => jibo.removeSubscriber(ws);
  jibo.addSubscriber(ws);
  ws.on('close', unsubscribe);
  ws.on('error', unsubscribe);
});

// Once listening: log the URL, connect to the robot, and start a WakewordWatcher
// whose messages are forwarded through jibo._broadcastToClients.
server.listen(APP_PORT, () => {
  console.log(`Re-Commander running at http://localhost:${APP_PORT}`);
  jibo.connect();
  const forwardToClients = (msg) => jibo._broadcastToClients(msg);
  new WakewordWatcher(forwardToClients);
});
|
||||
Reference in New Issue
Block a user