// NOTE(review): removed non-source scraper artifacts that preceded the code
// ("BACKFLIP…" banner and duplicated "843 lines / 28 KiB / JavaScript"
// page chrome) — they were not valid JavaScript.
/* eslint-disable no-console */

// Node stdlib dependencies used throughout this file.
const fs = require('fs');
const http = require('http');
const https = require('https');
const crypto = require('crypto');

// `ws` implementation; resolved at module load via requireWs() below.
let WebSocket;

/**
 * Resolve the `ws` websocket package.
 * @returns {*} The `ws` module export (constructor + Server).
 * @throws If neither resolution path can find `ws`.
 */
function requireWs() {
  try {
    // Prefer a normal node resolution (server install, or local node_modules).
    return require('ws');
  } catch (_) {
    // Fallback for on-robot usage where `ws` already exists under the skills runtime.
    return require('/opt/jibo/Jibo/Skills/@be/be/node_modules/ws');
  }
}

WebSocket = requireWs();
/** Current wall-clock time in milliseconds since the Unix epoch. */
function nowMs() {
  const timestamp = Date.now();
  return timestamp;
}
/**
 * Generate a UUID-shaped identifier.
 * Uses crypto.randomUUID when available (Node 14.17+); otherwise joins
 * random hex groups into the standard 8-4-4-4-12 layout.
 * @returns {string} A 36-character id string.
 */
function uuid() {
  if (typeof crypto.randomUUID === 'function') {
    return crypto.randomUUID();
  }
  const groups = [4, 2, 2, 2, 6].map((byteCount) => crypto.randomBytes(byteCount).toString('hex'));
  return groups.join('-');
}
/**
 * Read a file as UTF-8 and parse it as JSON.
 * @param {string} filePath - Path to a JSON file.
 * @returns {*} The parsed value.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function loadJson(filePath) {
  return JSON.parse(fs.readFileSync(filePath, 'utf8'));
}
/**
 * Resolve after roughly `ms` milliseconds (setTimeout-based; never rejects).
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Build a leveled console logger with a '[hub-shim]' prefix.
 * Levels: error(0) < warn(1) < info(2) < debug(3); unknown/absent levels
 * default to info. Each method short-circuits (evaluating to false) when its
 * rank is above the configured threshold, otherwise forwards to console.
 * @param {string} level - One of 'error'|'warn'|'info'|'debug'.
 */
function logFactory(level) {
  const RANKS = { error: 0, warn: 1, info: 2, debug: 3 };
  const threshold = RANKS[level] ?? 2;
  // Each logger method keeps the original short-circuit shape so its return
  // value (false when filtered, undefined when emitted) is unchanged.
  const emit = (rank, write) => (...args) => rank <= threshold && write('[hub-shim]', ...args);
  return {
    error: emit(0, (...args) => console.error(...args)),
    warn: emit(1, (...args) => console.warn(...args)),
    info: emit(2, (...args) => console.log(...args)),
    debug: emit(3, (...args) => console.log(...args)),
  };
}
/**
 * Coerce any value to a trimmed string; falsy inputs (null, undefined, '',
 * 0, false, NaN) become ''.
 */
function normalizeText(text) {
  const asString = text ? String(text) : '';
  return asString.trim();
}
/**
 * Rough yes/no polarity classifier for a transcript.
 * Negative phrases are checked before positive ones so an explicit "no"
 * beats incidental positive words in the same utterance.
 * @param {string} text - Raw transcript.
 * @returns {'yes'|'no'|''} '' when neither polarity is detected.
 */
function classifyYesNo(text) {
  const lowered = String(text || '').trim().toLowerCase();
  const NEGATIVE = /\b(no|nope|nah|not really|don't|do not|didn't|did not)\b/;
  const POSITIVE = /\b(yes|yep|yeah|ya|sure|ok|okay|i did|i do|i liked it|liked it|that was good|that was great|great|good|awesome|amazing)\b/;
  if (NEGATIVE.test(lowered)) return 'no';
  if (POSITIVE.test(lowered)) return 'yes';
  return '';
}
/**
 * Wrap a transcript in the minimal ASR result shape the hub returns.
 * Confidence is fixed at 1 (this shim has no real scoring) and the optional
 * alternates list is left empty.
 */
function buildAsrResult(text) {
  const result = {
    text,
    confidence: 1,
    alternates: [],
  };
  return result;
}
/**
 * Build the NLU result object this shim hands back to the Mim framework.
 *
 * Many flows either use `intent` directly or compute firstGrammarTag from
 * entities[slotActions[0]]. By default slotActions mirrors the intent (and
 * an entity keyed by the intent is added when missing); launch-style
 * grammars can pass explicit slotActions instead.
 *
 * @param {string} intent - Matched intent ('' means no match).
 * @param {string[]} [rules] - The grammar rules this match was made against.
 * @param {object} [entities] - Entity map to copy into the result.
 * @param {string[]} [slotActionsOverride] - Explicit slotActions list.
 */
function buildNluResult(intent, rules = [], entities = {}, slotActionsOverride) {
  const resolvedIntent = intent || '';
  const mergedEntities = entities && typeof entities === 'object' ? { ...entities } : {};

  let slotActions = [];
  if (Array.isArray(slotActionsOverride) && slotActionsOverride.length) {
    slotActions = slotActionsOverride.map((entry) => String(entry)).filter(Boolean);
  } else if (resolvedIntent) {
    // Default: the intent doubles as the single slot action, with a matching
    // self-referential entity so firstGrammarTag resolves.
    slotActions = [resolvedIntent];
    if (mergedEntities[resolvedIntent] == null) {
      mergedEntities[resolvedIntent] = resolvedIntent;
    }
  }

  return {
    rules: Array.isArray(rules) ? rules : [],
    intent: resolvedIntent,
    entities: mergedEntities,
    slotActions,
    source: 'hub-shim',
    confidence: resolvedIntent ? 1 : 0,
  };
}
/**
 * True when `rules` contains a case-insensitive match for `want`.
 * Accepts exact matches, "<prefix>/<want>" suffix matches, and a few
 * well-known aliasing patterns (launch / dance / take_photo grammar names).
 * @param {string[]} rules - Grammar rule names.
 * @param {string} want - Rule (or rule alias) to look for.
 */
function hasRule(rules, want) {
  if (!want || !Array.isArray(rules)) return false;
  const needle = String(want).toLowerCase();
  return rules.some((rule) => {
    const candidate = String(rule || '').toLowerCase();
    if (!candidate) return false;
    if (candidate === needle) return true;
    if (candidate.endsWith(`/${needle}`)) return true;
    // Common aliasing patterns.
    switch (needle) {
      case 'launch':
        return candidate.includes('global_commands_launch') || candidate.includes('commands_launch');
      case 'dance':
        return candidate.includes('tutorial/dance');
      case 'take_photo':
      case 'photo':
        return candidate.includes('tutorial/take_photo') || candidate.includes('take_photo');
      default:
        return false;
    }
  });
}
/**
 * Heuristic, offline stand-in for the cloud NLU service.
 *
 * Maps a final ASR transcript plus the currently-active grammar rule names to
 * the NLU result shape produced by buildNluResult(). Matching order matters:
 * skill-specific rule groups are checked before the generic yes/no and global
 * launch patterns so an in-skill grammar always wins. Returning an empty
 * intent ('') signals noInput/noMatch and lets the Mim framework re-prompt.
 *
 * @param {string} text - Final transcript (may be empty/undefined).
 * @param {string[]} rules - Grammar rule names active for this listen turn.
 * @returns {object} Result of buildNluResult().
 */
function inferNluFromText(text, rules) {
  const t = normalizeText(text).toLowerCase();

  // No text → empty intent (ListenResultState.noInput → Mim re-prompts or times out).
  if (!t) return buildNluResult('', rules, {});

  // Yes/No detection — common across many MIM types.
  const yn = classifyYesNo(text);

  // With no rules to match against, yes/no polarity is all we can report.
  if (!Array.isArray(rules) || !rules.length) {
    return yn ? buildNluResult(yn, rules, {}) : buildNluResult('', rules, {});
  }

  // ── Skill-specific rule matching (checked BEFORE global catch-all) ──

  // introductions/recognition_type_menu: expects face, name, voice, all
  if (hasRule(rules, 'recognition_type_menu')) {
    if (/\bface\b/.test(t)) return buildNluResult('face', rules, {});
    if (/\bname\b/.test(t)) return buildNluResult('name', rules, {});
    if (/\bvoice\b/.test(t)) return buildNluResult('voice', rules, {});
    if (/\b(all|everything|everyone)\b/.test(t)) return buildNluResult('all', rules, {});
    if (yn) return buildNluResult(yn, rules, {});
    return buildNluResult('', rules, {});
  }

  // introductions/voice_face_training_menu: similar recognition type choices
  if (hasRule(rules, 'voice_face_training_menu')) {
    if (/\bface\b/.test(t)) return buildNluResult('face', rules, {});
    if (/\bname\b/.test(t)) return buildNluResult('name', rules, {});
    if (/\bvoice\b/.test(t)) return buildNluResult('voice', rules, {});
    if (/\b(all|everything)\b/.test(t)) return buildNluResult('all', rules, {});
    if (yn) return buildNluResult(yn, rules, {});
    return buildNluResult('', rules, {});
  }

  // introductions yes/no questions: face_capture_ready, did_i_hear_name,
  // did_i_pronounce_name, any_more_intros, recognition_any_more
  if (rules.some((r) => /face_capture|did_i_hear|did_i_pronounce|any_more|recognition_any_more/.test(r))) {
    if (yn) return buildNluResult(yn, rules, {});
    return buildNluResult('', rules, {});
  }

  // introductions/intro_looper: expects loopmember intent with loopMemberReferent entity.
  // Without real NLU we cannot resolve the entity → return noMatch so the MIM re-prompts.
  if (hasRule(rules, 'intro_looper')) {
    if (yn) return buildNluResult(yn, rules, {});
    return buildNluResult('', rules, {});
  }

  // main-menu/execute_main_menu: map spoken words to menu items.
  // Each hit returns the 'loadMenu' intent with the target menu id as entity.
  if (hasRule(rules, 'execute_main_menu')) {
    if (/\bintroduc/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'introductions' });
    if (/\bsurprise/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'surprise-me' });
    if (/\b(time|clock)\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'clock' });
    if (/\bphoto\s*booth\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'photobooth' });
    if (/\bgallery\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'gallery' });
    if (/\b(exercise|workout)\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'exercise' });
    if (/\b(radio|music)\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'radio' });
    if (/\bsettings?\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'settings' });
    if (/\btips?\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'tips-tricks' });
    if (/\bfun\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'fun-stuff' });
    if (/\bcreate\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'create' });
    if (/\b(report|personal)\b/.test(t)) return buildNluResult('loadMenu', rules, { loadMenu: 'personal-report' });
    // Fall through to generic patterns below.
  }

  // ── Generic patterns ──

  // Yes/No (applies to any MIM type that accepts it)
  if (yn) return buildNluResult(yn, rules, {});

  // Dance / Photo (tutorial rules)
  if (hasRule(rules, 'dance') && /\bdance\b/.test(t)) return buildNluResult('dance', rules, {});
  if (hasRule(rules, 'take_photo') && /\b(photo|picture|take a photo|take a picture)\b/.test(t)) return buildNluResult('take_photo', rules, {});

  // Global launch commands — only for specific well-known commands.
  // Do NOT catch-all to chitchat; that breaks in-skill NLU.
  // The original (un-normalized) text is preserved in the `query` entity.
  if (hasRule(rules, 'launch')) {
    if (/\bdance\b/.test(t)) return buildNluResult('launch', rules, { skill: 'dance', query: normalizeText(text) }, ['skill']);
    if (/\b(photo|picture|take a photo|take a picture|selfie)\b/.test(t)) return buildNluResult('launch', rules, { skill: 'photobooth', query: normalizeText(text) }, ['skill']);
  }

  // Default: no match — lets the Mim framework re-prompt or handle noMatch.
  return buildNluResult('', rules, {});
}
/**
 * POST a JSON payload and resolve with the raw response.
 * Works over http or https depending on the URL scheme.
 *
 * @param {string} urlString - Absolute target URL.
 * @param {object} payload - Value to JSON-encode as the request body ({} when falsy).
 * @param {number} [timeoutMs] - Socket inactivity timeout (default 6000ms).
 * @returns {Promise<{statusCode: number, headers: object, body: string}>}
 *   Rejects on network error or timeout.
 */
function httpJsonPostRaw(urlString, payload, timeoutMs) {
  const effectiveTimeout = typeof timeoutMs === 'number' ? timeoutMs : 6000;
  const target = new URL(urlString);
  const secure = target.protocol === 'https:';
  const body = Buffer.from(JSON.stringify(payload || {}), 'utf8');

  const options = {
    protocol: target.protocol,
    hostname: target.hostname,
    port: target.port || (secure ? 443 : 80),
    path: target.pathname + (target.search || ''),
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Content-Length': body.length,
    },
    timeout: effectiveTimeout,
  };

  const transport = secure ? https : http;
  return new Promise((resolve, reject) => {
    const req = transport.request(options, (res) => {
      const chunks = [];
      res.on('data', (chunk) => chunks.push(chunk));
      res.on('end', () => {
        resolve({
          statusCode: res.statusCode,
          headers: res.headers || {},
          body: Buffer.concat(chunks).toString('utf8'),
        });
      });
    });
    req.on('error', reject);
    // 'timeout' fires on socket inactivity; destroying with an Error routes
    // the failure through the 'error' handler above.
    req.on('timeout', () => {
      req.destroy(new Error('POST timeout'));
    });
    req.write(body);
    req.end();
  });
}
/**
 * Buffer an incoming request body and parse it as JSON.
 * Resolves {} for an empty body. Rejects (and destroys the request) when the
 * body exceeds `maxBytes`; rejects on malformed JSON or stream errors.
 *
 * @param {import('http').IncomingMessage} req - Readable request stream.
 * @param {number} [maxBytes] - Size cap (default 256 KiB).
 * @returns {Promise<object>}
 */
function readJsonBody(req, maxBytes = 256 * 1024) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let received = 0;
    req.on('data', (chunk) => {
      received += chunk.length || 0;
      if (received > maxBytes) {
        reject(new Error('request body too large'));
        try { req.destroy(); } catch (_) { /* ignore */ }
        return;
      }
      chunks.push(chunk);
    });
    req.on('end', () => {
      const raw = Buffer.concat(chunks).toString('utf8');
      if (!raw) {
        resolve({});
        return;
      }
      try {
        resolve(JSON.parse(raw));
      } catch (_) {
        reject(new Error('invalid json body'));
      }
    });
    req.on('error', reject);
  });
}
/**
 * Ask a local Ollama server for a short completion.
 * Tries POST /api/generate first; on any failure there (network error or
 * non-2xx status) falls back to POST /api/chat. Returns the trimmed answer
 * text, or '' immediately for an empty prompt.
 *
 * @param {object} opts
 * @param {string} [opts.baseUrl] - Ollama base URL (default http://127.0.0.1:11434).
 * @param {string} [opts.model] - Model name (default 'llama3').
 * @param {string} [opts.system] - System prompt.
 * @param {string} [opts.prompt] - User prompt.
 * @param {number} [opts.timeoutMs] - Per-request timeout.
 * @returns {Promise<string>}
 * @throws When the /api/chat fallback also returns a non-2xx status.
 */
async function ollamaAnswer({ baseUrl, model, system, prompt, timeoutMs }) {
  const root = String(baseUrl || 'http://127.0.0.1:11434').replace(/\/+$/, '');
  const modelName = String(model || 'llama3');
  const systemText = typeof system === 'string' ? system : 'Answer conversationally and concisely.';
  const question = String(prompt || '').trim();
  if (!question) return '';

  // Preferred endpoint: /api/generate.
  try {
    const generated = await httpJsonPostRaw(`${root}/api/generate`, {
      model: modelName,
      system: systemText,
      prompt: question,
      stream: false,
    }, timeoutMs);
    if (generated.statusCode >= 200 && generated.statusCode < 300) {
      const parsed = JSON.parse(generated.body || '{}');
      return String(parsed.response || '').trim();
    }
  } catch (_) {
    // fall through to /api/chat
  }

  // Fallback endpoint: /api/chat with a two-message conversation.
  const chat = await httpJsonPostRaw(`${root}/api/chat`, {
    model: modelName,
    stream: false,
    messages: [
      { role: 'system', content: systemText },
      { role: 'user', content: question },
    ],
  }, timeoutMs);
  if (!(chat.statusCode >= 200 && chat.statusCode < 300)) {
    throw new Error(`ollama http ${chat.statusCode}`);
  }
  const parsedChat = JSON.parse(chat.body || '{}');
  return String(parsedChat.message && parsedChat.message.content ? parsedChat.message.content : '').trim();
}
/**
 * Optional GQA ("general question answering") HTTP shim backed by a local
 * Ollama server.
 *
 * Speaks an AWS-JSON-style protocol: the operation name is taken from the
 * last '.'-segment of the 'x-amz-target' request header, with 'Question' and
 * 'ListAttribution' handled (presumably mirroring the cloud GQA service's
 * shape — confirm against the client). Disabled unless config.gqaShim.enabled
 * is truthy or JIBO_GQA_SHIM_ENABLE is '1'/'true'.
 *
 * @param {object} config - Full shim config (reads config.gqaShim.*).
 * @param {object} logger - logFactory() logger.
 * @returns {{start: Function, stop: Function}|null} null when disabled.
 */
function createGqaShim(config, logger) {
  const gqa = (config && config.gqaShim) || {};
  const enabled = !!gqa.enabled || String(process.env.JIBO_GQA_SHIM_ENABLE || '').toLowerCase() === '1' || String(process.env.JIBO_GQA_SHIM_ENABLE || '').toLowerCase() === 'true';
  if (!enabled) return null;

  // Config precedence for every setting: config file, then env var, then default.
  const bindHost = gqa.bindHost || process.env.JIBO_GQA_SHIM_BIND_HOST || '0.0.0.0';
  const port = Number(gqa.port || process.env.JIBO_GQA_SHIM_PORT || 8080);
  const timeoutMs = Number(gqa.timeoutMs || process.env.JIBO_GQA_SHIM_TIMEOUT_MS || 20000);
  const ollama = (gqa.ollama || {});
  const ollamaBaseUrl = ollama.baseUrl || process.env.OLLAMA_BASE_URL || 'http://127.0.0.1:11434';
  const ollamaModel = ollama.model || process.env.OLLAMA_MODEL || 'llama3';
  const systemPrompt = ollama.systemPrompt || process.env.OLLAMA_SYSTEM_PROMPT || 'You are Jibo, a friendly social robot. Reply in 1-2 short spoken sentences.';

  const server = http.createServer(async (req, res) => {
    try {
      // Plain GET acts as a health check.
      if (req.method === 'GET') {
        res.writeHead(200, { 'content-type': 'text/plain' });
        res.end('jibo-gqa-shim\n');
        return;
      }
      if (req.method !== 'POST') {
        res.writeHead(405, { 'content-type': 'application/json' });
        res.end(JSON.stringify({ message: 'method not allowed' }));
        return;
      }

      // Operation is the suffix of "Service.Operation" in x-amz-target.
      const target = String((req.headers && (req.headers['x-amz-target'] || req.headers['X-Amz-Target'])) || '').trim();
      const op = target.includes('.') ? target.split('.').pop() : '';
      const body = await readJsonBody(req);

      if (op === 'Question') {
        const input = (body && (body.Input || body.input)) ? String(body.Input || body.input) : '';
        logger.info('gqa Question', { input: input.slice(0, 120) });
        let answer = '';
        try {
          answer = await ollamaAnswer({
            baseUrl: ollamaBaseUrl,
            model: ollamaModel,
            system: systemPrompt,
            prompt: input,
            timeoutMs,
          });
        } catch (e) {
          // Ollama failures degrade to an empty answer (reported as "No Match").
          logger.warn('ollama failed', { err: String(e && (e.stack || e.message || e)) });
          answer = '';
        }

        const ok = !!answer;
        const resp = {
          success: ok,
          source: ok ? 'MOCK' : 'No Match',
          // undefined keys are dropped by JSON.stringify, so `message` only
          // appears on failure.
          message: ok ? undefined : 'No response',
          type: ok ? 'answer' : 'error',
          timestamps: {},
          response: {
            type: 'string',
            payload: answer,
          },
          version: '0.0.1',
        };
        res.writeHead(200, { 'content-type': 'application/x-amz-json-1.0' });
        res.end(JSON.stringify(resp));
        return;
      }

      if (op === 'ListAttribution') {
        // No attribution data available locally; report an empty list.
        const resp = { success: true, attributions: [], timestamps: {}, version: '0.0.1' };
        res.writeHead(200, { 'content-type': 'application/x-amz-json-1.0' });
        res.end(JSON.stringify(resp));
        return;
      }

      res.writeHead(400, { 'content-type': 'application/x-amz-json-1.0' });
      res.end(JSON.stringify({ message: 'unknown operation', target }));
    } catch (e) {
      res.writeHead(500, { 'content-type': 'application/x-amz-json-1.0' });
      res.end(JSON.stringify({ message: String(e && (e.message || e)) }));
    }
  });

  logger.info('gqa http', { bindHost, port, ollamaBaseUrl, ollamaModel });

  return {
    start: () => new Promise((resolve, reject) => server.listen(port, bindHost, (err) => (err ? reject(err) : resolve()))),
    stop: () => new Promise((resolve) => server.close(() => resolve())),
  };
}
/**
 * Choose the highest-scoring utterance text from an ASR utterance list.
 * Accepts several field spellings (utterance/Utterance/text, score/Score;
 * missing scores count as 0).
 *
 * NOTE: when the first two characters are the same letter (case-insensitive)
 * the first is dropped — presumably a workaround for the ASR service
 * doubling the leading character; confirm before removing.
 *
 * Returns '' when nothing usable is present; never throws.
 */
function pickBestAsrUtterance(utterances) {
  try {
    if (!utterances || !utterances.length) return '';
    let winner = '';
    let winnerScore = -1e99;
    for (const entry of utterances) {
      const rawText = entry && (entry.utterance || entry.Utterance || entry.text)
        ? String(entry.utterance || entry.Utterance || entry.text)
        : '';
      let candidate = rawText.trim();
      if (!candidate) continue;
      // Strip a doubled leading character (e.g. "HHello" -> "Hello").
      if (candidate.length >= 2
          && candidate[0].toLowerCase && candidate[1].toLowerCase
          && candidate[0].toLowerCase() === candidate[1].toLowerCase()) {
        candidate = candidate.slice(1);
      }
      const candidateScore = typeof entry.score === 'number'
        ? entry.score
        : (typeof entry.Score === 'number' ? entry.Score : 0);
      if (!winner || candidateScore > winnerScore) {
        winner = candidate;
        winnerScore = candidateScore;
      }
    }
    return winner;
  } catch (_) {
    return '';
  }
}
/**
 * Pull the final transcript text out of an ASR event.
 * jibo-asr-service has had multiple JSON shapes across builds: the utterance
 * list is preferred, then the common flat and payload-nested text fields.
 * Returns '' when nothing usable is found; never throws.
 */
function extractFinalTextFromAsrEvent(evt) {
  try {
    const payload = evt.payload;
    const utterances = evt.utterances || evt.Utterances
      || (payload && (payload.utterances || payload.Utterances));
    const best = pickBestAsrUtterance(utterances);
    if (best && String(best).trim()) return String(best).trim();

    // Fallback fields, checked in order of historical likelihood.
    const fallbacks = [
      evt.text,
      evt.transcript,
      evt.utterance,
      evt.Utterance,
      payload && (payload.text || payload.transcript || payload.utterance || payload.Utterance),
    ];
    const hit = fallbacks.find((value) => typeof value === 'string' && value.trim());
    return hit ? hit.trim() : '';
  } catch (_) {
    return '';
  }
}
/**
 * Run one speech-to-text turn against the on-robot jibo-asr-service.
 *
 * jibo-asr-service protocol:
 * 1) Connect to WS /simple_port (event stream)
 * 2) HTTP POST /asr_simple_interface {command:'start', task_id, audio_source_id, ...}
 * 3) Wait for event_type=='speech_to_text_final' and extract utterance
 * 4) HTTP POST /asr_simple_interface {command:'stop', task_id}
 *
 * Resolves with the normalized final transcript; rejects on overall timeout,
 * start failure, websocket error, or the socket closing before a final event.
 * A best-effort 'stop' is posted on every exit path.
 *
 * @param {string} asrBaseUrl - HTTP base URL of the ASR service.
 * @param {string} wsPath - Websocket path of the event stream (e.g. '/simple_port').
 * @param {number} timeoutMs - Deadline for the whole turn.
 * @param {string} audioSourceId - Audio source id ('alsa1' when falsy).
 * @param {object} logger - logFactory() logger.
 * @returns {Promise<string>}
 */
async function asrServiceSttOnce(asrBaseUrl, wsPath, timeoutMs, audioSourceId, logger) {
  const url = new URL(asrBaseUrl);
  const wsUrl = `${url.protocol === 'https:' ? 'wss' : 'ws'}://${url.host}${wsPath}`;
  const baseHttp = `${url.protocol}//${url.host}`;
  // Unique ids so final events can be correlated back to this turn.
  const taskId = `hub-shim-${Date.now()}-${Math.floor(Math.random() * 1e9)}`;
  const requestId = `stt_start_${Date.now()}_${Math.floor(Math.random() * 1e9)}`;
  logger.debug('asr connect', { wsUrl, baseHttp, taskId });

  return await new Promise((resolve, reject) => {
    const ws = new WebSocket(wsUrl);
    let done = false; // guards against settling the promise twice
    const t0 = nowMs();
    let timer;

    // Settle exactly once, clear the deadline timer, and always close the WS.
    function finish(err, value) {
      if (done) return;
      done = true;
      if (timer) clearTimeout(timer);
      try {
        ws.close();
      } catch (_) {
        // ignore
      }
      if (err) reject(err);
      else resolve(value);
    }

    // Best-effort 'stop' so the shared ASR hardware is released on every path.
    async function stopAlways() {
      const stopPayload = {
        command: 'stop',
        task_id: taskId,
        request_id: `stt_stop_${Date.now()}_${Math.floor(Math.random() * 1e9)}`,
      };
      try {
        await httpJsonPostRaw(`${baseHttp}/asr_simple_interface`, stopPayload, 6000);
      } catch (_) {
        // ignore
      }
    }

    // Overall deadline for the whole turn (connect + start + final event).
    timer = setTimeout(() => {
      stopAlways().finally(() => {
        finish(new Error(`asr timeout after ${timeoutMs}ms`));
      });
    }, timeoutMs);

    // Once the event stream is up, kick off recognition over HTTP.
    ws.on('open', async () => {
      const startPayload = {
        command: 'start',
        task_id: taskId,
        audio_source_id: String(audioSourceId || 'alsa1'),
        hotphrase: 'none',
        speech_to_text: true,
        request_id: requestId,
      };
      try {
        const resp = await httpJsonPostRaw(`${baseHttp}/asr_simple_interface`, startPayload, 8000);
        if (!resp || !resp.statusCode || resp.statusCode < 200 || resp.statusCode >= 300) {
          await stopAlways();
          finish(new Error(`asr start failed: HTTP ${resp && resp.statusCode} ${resp && resp.body ? resp.body.slice(0, 200) : ''}`));
          return;
        }
        logger.info('asr start', { ms: nowMs() - t0, status: resp.statusCode });
      } catch (e) {
        await stopAlways();
        finish(e);
      }
    });

    ws.on('message', (data) => {
      let evt;
      try {
        evt = JSON.parse(data.toString('utf8'));
      } catch (_) {
        return; // non-JSON frames are ignored
      }
      if (!evt) return;

      // Legacy/simple protocols: {type:'final', text:'...'}
      const isFinal = evt.type === 'final' || evt.final === true || evt.isFinal === true;
      const simpleText = evt.text || evt.transcript;
      if (isFinal && typeof simpleText === 'string') {
        stopAlways().finally(() => {
          logger.info('asr final(simple)', { ms: nowMs() - t0, text: String(simpleText).slice(0, 80) });
          finish(null, normalizeText(simpleText));
        });
        return;
      }

      const eventType = evt.event_type || evt.eventType || evt.event || evt.type;
      if (eventType !== 'speech_to_text_final') return;

      // Correlate ids when possible, but do not hard-require a match.
      const evTaskId = evt.task_id || evt.taskId || (evt.payload && (evt.payload.task_id || evt.payload.taskId));
      const evRequestId = evt.request_id || evt.requestId || (evt.payload && (evt.payload.request_id || evt.payload.requestId));
      const idMatches = (!evTaskId && !evRequestId) || (evTaskId && String(evTaskId) === String(taskId)) || (evRequestId && String(evRequestId) === String(requestId));
      if (!idMatches) {
        // NOTE(review): a mismatched id is logged but the event is still
        // accepted below — presumably safe because the service is effectively
        // single-tenant; confirm before relying on this with multiple clients.
        logger.debug('asr final id mismatch (accepting)', {
          taskId,
          requestId,
          evTaskId,
          evRequestId,
        });
      }

      const finalText = extractFinalTextFromAsrEvent(evt);
      if (!finalText) {
        // Keep waiting: some builds emit a final event with no usable text.
        logger.debug('asr final but no text extracted', {
          keys: Object.keys(evt || {}).slice(0, 30),
          payloadKeys: evt && evt.payload ? Object.keys(evt.payload).slice(0, 30) : undefined,
        });
        return;
      }

      stopAlways().finally(() => {
        logger.info('asr final', { ms: nowMs() - t0, text: String(finalText).slice(0, 80) });
        finish(null, normalizeText(finalText));
      });
    });

    ws.on('error', async (e) => {
      await stopAlways();
      finish(e);
    });

    // If the socket drops before a final event, surface it as an error.
    ws.on('close', () => {
      if (!done) {
        finish(new Error('asr ws closed before final'));
      }
    });
  });
}
/** JSON-serialize `obj` and send it as one websocket text frame. */
function send(ws, obj) {
  const frame = JSON.stringify(obj);
  ws.send(frame);
}
/**
 * Read the Jetstream transaction id from an incoming request's headers.
 * (Jetstream sets an x-jibo-transid header; see @jibo/interfaces
 * Headers.transID.) Several historical capitalizations are tried in order;
 * returns '' when none is present.
 */
function extractTransIdFromHeaders(req) {
  const headers = req.headers || {};
  const spellings = ['x-jibo-transid', 'X-Jibo-TransId', 'x-jibo-transId', 'x-jibo-transID'];
  for (const key of spellings) {
    if (headers[key]) return headers[key];
  }
  return '';
}
/**
 * Create the hub shim: a websocket server that impersonates the cloud
 * listen/NLU hub for the on-robot jetstream-service.
 *
 * Per connection, jetstream sends CONTEXT and LISTEN JSON messages (plus
 * optional CLIENT_ASR / CLIENT_NLU overrides and ignored binary audio
 * frames). Once both CONTEXT and LISTEN are present, the shim runs on-robot
 * STT (serialized across connections), infers NLU locally, and replies with
 * a single TURN_RESULT message. Plain HTTP requests to the same port get a
 * health-check response.
 *
 * @param {string} configPath - Path to the shim's JSON config file.
 * @returns {{start: () => Promise<void>, stop: () => Promise<void>, config: object}}
 */
function createHubShim(configPath) {
  const config = loadJson(configPath);
  const logger = logFactory(config?.logging?.level || 'info');

  // Websocket listen endpoint settings.
  const listen = config.listen || {};
  const port = Number(listen.port || 9000);
  const bindHost = listen.bindHost || '0.0.0.0';
  const path = listen.path || '/v1/listen';

  // On-robot ASR service settings.
  const asr = config.asrService || {};
  const asrBaseUrl = asr.baseUrl || 'http://127.0.0.1:8088';
  const wsPath = asr.wsPath || '/simple_port';
  const timeoutMs = Number(asr.timeoutMs || 15000);
  const audioSourceId = asr.audioSourceId || 'alsa1';

  // The robot ASR service is effectively a shared hardware resource.
  // Jetstream may open multiple WS connections/listens concurrently; serialize STT
  // to avoid crosstalk and reduce timeouts.
  let asrQueue = Promise.resolve();
  // Chain `fn` onto the queue; runs whether or not the previous job failed,
  // and the queue itself never rejects.
  function runAsrQueued(fn) {
    const queuedAt = nowMs();
    const run = async () => {
      const waited = nowMs() - queuedAt;
      if (waited > 50) logger.debug('asr queue wait', { waitedMs: waited });
      return await fn();
    };
    const p = asrQueue.then(run, run);
    asrQueue = p.catch(() => undefined);
    return p;
  }

  // HTTP health check; the websocket server is attached to this same server.
  const server = http.createServer((req, res) => {
    res.writeHead(200, { 'content-type': 'text/plain' });
    res.end('jibo-hub-shim\n');
  });

  const wss = new WebSocket.Server({ server, path });
  logger.info('listen ws', { bindHost, port, path, asrBaseUrl, audioSourceId });

  // Optional GQA shim; null when disabled (see createGqaShim).
  const gqaShim = createGqaShim(config, logger);

  wss.on('connection', (ws, req) => {
    const connId = uuid().slice(0, 8);
    const transID = extractTransIdFromHeaders(req);
    logger.info('ws connected', { connId, transID: transID || undefined });

    // Counters for (ignored) binary audio frames, logged for diagnostics.
    let binaryFrames = 0;
    let binaryBytes = 0;

    // Per-connection state: latest CONTEXT and the LISTEN awaiting handling.
    let lastContext = null;
    let lastContextMsg = null;
    let pendingListen = null;

    // Run one listen turn when both CONTEXT and LISTEN (and any required
    // client-supplied ASR/NLU data) are available; otherwise do nothing.
    async function maybeHandleListen() {
      if (!lastContext || !pendingListen) return;
      const listenMsg = pendingListen;
      const mode = listenMsg?.data?.mode;

      // If the client is responsible for supplying ASR or NLU, wait until we receive it.
      if (mode === 'CLIENT_ASR' && !listenMsg._clientAsrText) return;
      if (mode === 'CLIENT_NLU' && !listenMsg._clientNlu) return;
      pendingListen = null;

      // Derive transID: echo back the transID the jetstream-service sent.
      // It may appear on either the LISTEN or CONTEXT message.
      const msgTransID = listenMsg.transID || (lastContextMsg && lastContextMsg.transID) || transID || '';

      const t0 = nowMs();

      let text = '';
      try {
        if (mode === 'CLIENT_ASR') {
          text = normalizeText(listenMsg._clientAsrText);
        } else if (mode === 'CLIENT_NLU') {
          // No ASR needed for client-supplied NLU.
        } else {
          text = await runAsrQueued(() => asrServiceSttOnce(asrBaseUrl, wsPath, timeoutMs, audioSourceId, logger));
        }
      } catch (e) {
        logger.warn('asr failed', { connId, err: String(e && (e.stack || e.message || e)) });
        // Still send an empty listen result.
      }

      const rules = Array.isArray(listenMsg?.data?.rules) ? listenMsg.data.rules : [];
      // NLU: client-supplied wins; otherwise local inference unless disabled
      // via config.nlu.enabled === false.
      const nluRes =
        (mode === 'CLIENT_NLU' && listenMsg._clientNlu)
          ? listenMsg._clientNlu
          : ((config?.nlu?.enabled === false) ? buildNluResult('', rules, {}) : inferNluFromText(text, rules));
      const asrRes = buildAsrResult(text);

      // Build the match object (mirrors what the cloud hub returns).
      // CONTEXT may carry skill as an object ({skillID}) or a bare string.
      const skillID = (lastContext && lastContext.skill && lastContext.skill.skillID)
        ? lastContext.skill.skillID
        : (lastContext && typeof lastContext.skill === 'string' ? lastContext.skill : '');
      const matchObj = { onRobot: true };
      if (skillID) matchObj.skillID = skillID;

      logger.info('listen result', {
        connId,
        transID: msgTransID || undefined,
        text: String(text || '').slice(0, 120),
        intent: nluRes && nluRes.intent,
        slot0: nluRes && Array.isArray(nluRes.slotActions) ? nluRes.slotActions[0] : undefined,
        skill: nluRes && nluRes.entities ? nluRes.entities.skill : undefined,
        rule0: Array.isArray(rules) ? rules[0] : undefined,
        rulesCount: Array.isArray(rules) ? rules.length : 0,
        rules: Array.isArray(rules) ? rules.slice(0, 6) : [],
      });

      // Send a local TURN_RESULT (not just LISTEN) so the skill's local turn resolves.
      const turnResult = {
        type: 'TURN_RESULT',
        msgID: listenMsg.msgID || uuid(),
        transID: msgTransID,
        ts: nowMs(),
        requestID: msgTransID, // local turn uses transID as requestID
        data: {
          status: 'SUCCEEDED',
          global: false,
          result: {
            asr: asrRes,
            nlu: nluRes,
            match: matchObj,
          },
        },
        final: true,
      };
      send(ws, turnResult);
    }

    ws.on('message', async (data, isBinary) => {
      if (isBinary) {
        binaryFrames += 1;
        try {
          binaryBytes += (data && (data.length || data.byteLength)) || 0;
        } catch (_) {
          // ignore
        }
        // Log the first few frames, then every 50th, to avoid log spam.
        if (binaryFrames <= 3 || (binaryFrames % 50) === 0) {
          logger.debug('rx binary', { connId, frames: binaryFrames, bytes: binaryBytes });
        }
        // Ignore audio/binary frames; this shim does not decode audio.
        return;
      }

      let msg;
      try {
        msg = JSON.parse(data.toString('utf8'));
      } catch (e) {
        logger.warn('bad json', { connId, sample: String(data || '').slice(0, 120) });
        return;
      }

      if (!msg || typeof msg.type !== 'string') return;
      logger.debug('rx', { connId, type: msg.type });

      switch (msg.type) {
        case 'CONTEXT':
          lastContext = msg.data;
          lastContextMsg = msg;
          logger.debug('context', {
            connId,
            transID: msg.transID || undefined,
            hasSkill: !!(msg.data && msg.data.skill),
            hasRuntime: !!(msg.data && msg.data.runtime),
          });
          break;
        case 'LISTEN':
          pendingListen = msg;
          logger.debug('listen req', {
            connId,
            transID: msg.transID || undefined,
            hotphrase: !!(msg.data && msg.data.hotphrase),
            mode: msg.data && msg.data.mode,
            rules: Array.isArray(msg.data && msg.data.rules) ? msg.data.rules : [],
          });
          break;
        case 'CLIENT_ASR':
          // Accept client-provided text (requires a LISTEN message too).
          // Synthesize a LISTEN if one hasn't arrived yet.
          if (!pendingListen) {
            pendingListen = { type: 'LISTEN', msgID: uuid(), transID: msg.transID, ts: nowMs(), data: { rules: [], mode: 'CLIENT_ASR' } };
          }
          pendingListen._clientAsrText = msg.data?.text;
          break;
        case 'CLIENT_NLU':
          if (!pendingListen) {
            pendingListen = { type: 'LISTEN', msgID: uuid(), transID: msg.transID, ts: nowMs(), data: { rules: [], mode: 'CLIENT_NLU' } };
          }
          pendingListen._clientNlu = msg.data;
          break;
        default:
          // TRIGGER/proactive not implemented yet.
          break;
      }

      // Normal path: wait for both CONTEXT and LISTEN then handle.
      try {
        await maybeHandleListen();
      } catch (e) {
        logger.warn('listen handler error', { connId, err: String(e && (e.stack || e.message || e)) });
      }
    });

    ws.on('close', () => {
      logger.info('ws closed', { connId });
    });

    ws.on('error', (e) => {
      logger.warn('ws error', { connId, err: String(e && (e.stack || e.message || e)) });
    });
  });

  return {
    // start() brings up the main server, then the GQA shim (if enabled);
    // a GQA startup failure rejects the whole start.
    start: () => new Promise((resolve, reject) => {
      server.listen(port, bindHost, async (err) => {
        if (err) return reject(err);
        try {
          if (gqaShim) await gqaShim.start();
          resolve();
        } catch (e) {
          reject(e);
        }
      });
    }),
    // stop() always resolves, even if the GQA shim fails to close.
    stop: () => new Promise((resolve) => {
      const done = async () => {
        try {
          if (gqaShim) await gqaShim.stop();
        } finally {
          resolve();
        }
      };
      server.close(() => { done().catch(() => resolve()); });
    }),
    config,
  };
}
/**
 * CLI entry point: resolve the config path (argv[2], then
 * JIBO_HUB_SHIM_CONFIG, then ./config.json), start the shim, and park the
 * process forever. Exits with code 2 when the config file is missing.
 */
async function main() {
  const configPath = process.argv[2] || process.env.JIBO_HUB_SHIM_CONFIG || './config.json';
  if (!fs.existsSync(configPath)) {
    console.error('Config file not found:', configPath);
    process.exit(2);
  }
  const shim = createHubShim(configPath);
  await shim.start();
  // Keep the process (and its servers) alive indefinitely.
  while (true) {
    await sleep(60_000);
  }
}
// Run only when executed as a script (not when require()'d as a module):
// start the shim and exit non-zero on any fatal startup/runtime failure.
if (require.main === module) {
  main().catch((e) => {
    console.error('[hub-shim] fatal:', e && (e.stack || e.message || e));
    process.exit(1);
  });
}