Broken AI server Part 1

its 5:30 in the morning
2026-03-19 05:34:25 +02:00
parent 5c706a13b4
commit ba50c0fc08
43 changed files with 6726 additions and 0 deletions
--- a/V3.1/build/ai_bridge_server/README.md
+++ b/V3.1/build/ai_bridge_server/README.md
@@ -0,0 +1,46 @@
+# Jibo AI Bridge Server
+
+This is a small companion server you run on your PC (same machine as Ollama).
+It gives the robot a stable HTTP target and keeps the on-robot code modular.
+
+## Endpoints
+
+- `POST /v1/chat/text` JSON: `{ "text": "..." }` → `{ "reply": "..." }`
+- `POST /v1/chat/audio` JSON: `{ "wav_base64": "..." }` → `{ "reply": "...", "text": "<transcript>" }`
+
+## Requirements
+
+- Python 3.9+
+- Ollama running locally
+  - default Ollama chat URL: `http://127.0.0.1:11434/api/chat`
+
+Optional (for AUDIO mode):
+- `faster-whisper` + `ffmpeg`
+
+## Run
+
+From this folder:
+
+- `python3 server.py --host 0.0.0.0 --port 8020`
+
+Environment variables (optional):
+
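+- `OLLAMA_MODEL` (default `phi3.5`)
+- `OLLAMA_URL` (default `http://127.0.0.1:11434/api/chat`)
+- `WHISPER_MODEL` (default `base`)
+- `AI_BRIDGE_LAST_WAV` (default `jibo_last.wav`; path where the most recently received audio clip is saved for debugging, overwritten on each request)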
+
+Note: Ollama can stay bound to `127.0.0.1:11434` on your PC; the robot only talks to this bridge server (`:8020`).
+
+Install optional STT deps:
+
+- `pip install faster-whisper`
+- install `ffmpeg` (platform-specific)
+
+## Robot configuration
+
+Open the tunables UI (`http://<robot-ip>:3333`) and set:
+
+- **Jibo AI Bridge → Server URL**: `http://<your-pc-ip>:8020`
+- **Jibo AI Bridge → Input**:
+  - `AUDIO` (records a short WAV clip on the robot and sends it)
+  - `TEXT` (uses `globalTurnResult` ASR text if available)
--- a/V3.1/build/ai_bridge_server/pycache/server.cpython-313.pyc
+++ b/V3.1/build/ai_bridge_server/pycache/server.cpython-313.pyc
--- a/V3.1/build/ai_bridge_server/jibo_last.wav
+++ b/V3.1/build/ai_bridge_server/jibo_last.wav
--- a/V3.1/build/ai_bridge_server/server.py
+++ b/V3.1/build/ai_bridge_server/server.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python3
+"""Minimal AI Bridge server for Jibo.
+
+Endpoints:
+- POST /v1/chat/text  {"text": "..."} -> {"reply": "..."}
+- POST /v1/chat/audio {"wav_base64": "...", "sample_rate": 16000} -> {"reply": "...", "text": "<transcript>"}
+  (only "wav_base64" is read; "sample_rate" is currently ignored by the server)
+
+LLM:
+- Uses Ollama Chat API by default: http://127.0.0.1:11434/api/chat
+  Env:
+    OLLAMA_URL   (default: http://127.0.0.1:11434/api/chat)
+    OLLAMA_MODEL (default: phi3.5)
+
+STT (optional, for /audio):
+- If `faster-whisper` is installed, it will be used.
+  Env:
+    WHISPER_MODEL (default: base)
+
+Run:
+  python3 server.py --host 0.0.0.0 --port 8020
+"""
+
+from __future__ import annotations
+
+import argparse
+import array
+import base64
+import io
+import json
+import os
+import tempfile
+import time
+import traceback
+import wave
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from urllib.request import Request, urlopen
+
+
+def _ts() -> str:
+    """Return the current local time as ``YYYY-MM-DD HH:MM:SS`` (used as a log prefix)."""
+    stamp_format = "%Y-%m-%d %H:%M:%S"
+    return time.strftime(stamp_format, time.localtime())
+
+
+def _log(msg: str):
+    """Print *msg* to stdout behind a bracketed timestamp, flushing immediately."""
+    # Logging is intentionally always on: visibility matters while debugging the bridge.
+    line = "[{}] {}".format(_ts(), msg)
+    print(line, flush=True)
+
+
+def _json_response(handler: BaseHTTPRequestHandler, status: int, payload: dict):
+    """Serialize *payload* as JSON and send it through *handler* with HTTP *status*.
+
+    Writes the status line, ``Content-Type`` and ``Content-Length`` headers,
+    then the UTF-8 encoded body.
+    """
+    encoded = json.dumps(payload).encode("utf-8")
+    response_headers = (
+        ("Content-Type", "application/json"),
+        ("Content-Length", str(len(encoded))),
+    )
+    handler.send_response(status)
+    for header_name, header_value in response_headers:
+        handler.send_header(header_name, header_value)
+    handler.end_headers()
+    handler.wfile.write(encoded)
+
+
+def _read_json(handler: BaseHTTPRequestHandler) -> dict:
+    """Read the request body from *handler* and decode it as a JSON object.
+
+    A missing, blank, zero or negative ``Content-Length`` is treated as an
+    empty body and yields ``{}``.
+
+    Raises:
+        ValueError: if ``Content-Length`` is not an integer, if the body is not
+            valid UTF-8 JSON (``UnicodeDecodeError`` and ``json.JSONDecodeError``
+            are both ``ValueError`` subclasses), or if the top-level JSON value
+            is not an object.
+    """
+    header = (handler.headers.get("Content-Length") or "").strip()
+    length = int(header) if header else 0
+    # rfile.read() with a negative size reads until EOF, which can block the
+    # worker thread on an open connection; never pass a non-positive length on.
+    if length <= 0:
+        return {}
+    raw = handler.rfile.read(length)
+    payload = json.loads(raw.decode("utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"JSON body must be an object, got {type(payload).__name__}")
+    return payload
+
+
+def _ollama_chat(user_text: str) -> str:
+    """Send *user_text* to the configured chat endpoint and return the stripped reply.
+
+    The endpoint and model come from the ``OLLAMA_URL`` and ``OLLAMA_MODEL``
+    environment variables, which are read on every call.
+
+    Raises:
+        RuntimeError: if the decoded response has no non-empty ``message.content``.
+    """
+    endpoint = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
+    model_name = os.environ.get("OLLAMA_MODEL", "phi3.5")
+
+    system_message = {
+        "role": "system",
+        "content": "You are Jibo, a friendly home robot. Keep replies short and spoken.",
+    }
+    user_message = {"role": "user", "content": user_text}
+    # Streaming is disabled; the whole response body is parsed as one JSON document below.
+    request_payload = {
+        "model": model_name,
+        "stream": False,
+        "messages": [system_message, user_message],
+    }
+    encoded_payload = json.dumps(request_payload).encode("utf-8")
+
+    request = Request(
+        endpoint,
+        data=encoded_payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urlopen(request, timeout=60) as response:
+        decoded = json.loads(response.read().decode("utf-8"))
+
+    reply_text = (decoded.get("message") or {}).get("content")
+    if reply_text:
+        return reply_text.strip()
+    raise RuntimeError(f"Unexpected Ollama response: {decoded}")
+
+
+def _wav_diagnostics(wav_bytes: bytes) -> dict:
+    """Collect header fields and quick signal statistics for a WAV payload.
+
+    Never raises: header problems are reported under ``parse_error`` and
+    failures while computing statistics under ``stats_error``. Statistics are
+    only produced for 16-bit samples and cover at most the first ~3 seconds.
+    """
+
+    def _rms(values) -> float:
+        # Root-mean-square of a non-empty sample sequence.
+        return (sum(float(v) * float(v) for v in values) / float(len(values))) ** 0.5
+
+    report: dict = {"bytes": len(wav_bytes)}
+    try:
+        with wave.open(io.BytesIO(wav_bytes), "rb") as reader:
+            channel_count = reader.getnchannels()
+            sample_width = reader.getsampwidth()
+            frame_rate = reader.getframerate()
+            frame_count = reader.getnframes()
+            report.update(
+                {
+                    "channels": channel_count,
+                    "sample_width": sample_width,
+                    "frame_rate": frame_rate,
+                    "frames": frame_count,
+                }
+            )
+            # Cap the analysed window at roughly three seconds to bound CPU cost.
+            pcm = reader.readframes(min(frame_count, frame_rate * 3))
+    except Exception as exc:
+        report["parse_error"] = str(exc)
+        return report
+
+    # Signal statistics assume 16-bit samples; other widths get header info only.
+    if report.get("sample_width") != 2:
+        return report
+
+    try:
+        pcm_samples = array.array("h")
+        pcm_samples.frombytes(pcm)
+        if len(pcm_samples) == 0:
+            return report
+
+        total = float(len(pcm_samples))
+        silent = sum(1 for value in pcm_samples if value == 0)
+        # With several channels, also report each channel's RMS (helps debug mic arrays).
+        channels = int(report.get("channels") or 1)
+        per_channel = None
+        if channels > 1:
+            per_channel = [
+                _rms(lane) if lane else 0.0
+                for lane in (pcm_samples[index::channels] for index in range(channels))
+            ]
+        report.update(
+            {
+                "min": int(min(pcm_samples)),
+                "max": int(max(pcm_samples)),
+                # RMS over the interleaved samples: a coarse signal-presence indicator.
+                "rms": float(_rms(pcm_samples)),
+                "zero_frac": float(silent) / total,
+                "channel_rms": per_channel,
+            }
+        )
+    except Exception as exc:
+        report["stats_error"] = str(exc)
+    return report
+
+
+def _to_loudest_channel_mono_wav(wav_bytes: bytes) -> tuple[bytes, dict]:
+    """Reduce a multi-channel 16-bit PCM WAV to its loudest channel.
+
+    Returns:
+        ``(wav, details)``. When the input is multi-channel 16-bit PCM, ``wav``
+        is a new mono WAV holding the channel with the highest RMS and
+        ``details`` has ``converted=True`` plus ``picked_channel``,
+        ``channel_rms`` and ``frame_rate``. In every other case the input bytes
+        are returned unchanged and ``details`` says why (``convert_error`` for
+        an unparseable header, ``converted=False`` otherwise).
+
+    Does not raise for malformed input.
+    """
+
+    def _rms(channel) -> float:
+        # Root-mean-square of one channel; an empty channel counts as silent.
+        if not channel:
+            return 0.0
+        return (sum(float(s) * float(s) for s in channel) / float(len(channel))) ** 0.5
+
+    try:
+        with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
+            nch = wf.getnchannels()
+            sw = wf.getsampwidth()
+            fr = wf.getframerate()
+            frames = wf.readframes(wf.getnframes())
+    except Exception as e:
+        return wav_bytes, {"convert_error": str(e)}
+
+    if nch <= 1 or sw != 2:
+        return wav_bytes, {"converted": False, "channels": nch, "sample_width": sw}
+
+    # A truncated clip can end in the middle of a frame. array.frombytes()
+    # rejects a byte count that is not a multiple of the item size, so keep
+    # only whole frames instead of failing the entire request.
+    frame_size = sw * nch
+    whole = len(frames) - (len(frames) % frame_size)
+    samples = array.array("h")
+    samples.frombytes(frames[:whole])
+    if not samples:
+        return wav_bytes, {"converted": False, "reason": "empty_samples"}
+
+    # Samples are interleaved, so channel c is every nch-th sample starting at c.
+    rms_list: list[float] = [_rms(samples[c::nch]) for c in range(nch)]
+    best = max(range(nch), key=rms_list.__getitem__)
+    mono = samples[best::nch]
+
+    out = io.BytesIO()
+    with wave.open(out, "wb") as ow:
+        ow.setnchannels(1)
+        ow.setsampwidth(2)
+        ow.setframerate(fr)
+        ow.writeframes(mono.tobytes())
+
+    return out.getvalue(), {"converted": True, "picked_channel": best, "channel_rms": rms_list, "frame_rate": fr}
+
+
+class _Whisper:
+    """Lazy wrapper around an optional ``faster-whisper`` model used for speech-to-text."""
+
+    def __init__(self):
+        # The model is created on the first transcription, not at construction,
+        # so the server can start without faster-whisper installed.
+        # NOTE(review): the lazy load is not guarded by a lock; with the
+        # threaded server, two simultaneous first requests may each load a model.
+        self._model = None
+
+    def available(self) -> bool:
+        """Return True if the ``faster_whisper`` package can be imported."""
+        try:
+            import faster_whisper  # noqa: F401
+        except Exception:
+            return False
+        return True
+
+    def transcribe_wav_bytes(self, wav_bytes: bytes) -> str:
+        """Transcribe WAV data and return the stripped, concatenated segment text.
+
+        The model name comes from ``WHISPER_MODEL`` (default ``base``) and is
+        only consulted when the model is first created.
+
+        Raises:
+            RuntimeError: if ``faster_whisper`` cannot be imported.
+        """
+        try:
+            from faster_whisper import WhisperModel
+        except Exception as e:
+            raise RuntimeError(
+                "Audio mode requires `faster-whisper` (pip install faster-whisper) and ffmpeg on your PC"
+            ) from e
+
+        if self._model is None:
+            model_name = os.environ.get("WHISPER_MODEL", "base")
+            # CPU-friendly default; user can override via WHISPER_MODEL.
+            self._model = WhisperModel(model_name, device="cpu", compute_type="int8")
+
+        # The clip is handed to the model by path. On Windows a NamedTemporaryFile
+        # cannot be opened a second time while it is still open, so write it,
+        # close it, and delete it ourselves once transcription is done.
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        try:
+            try:
+                tmp.write(wav_bytes)
+            finally:
+                tmp.close()
+            segments, _info = self._model.transcribe(tmp.name)
+            # Consume the segments before the file is removed, in case they are
+            # produced lazily from it.
+            return "".join(seg.text for seg in segments).strip()
+        finally:
+            try:
+                os.unlink(tmp.name)
+            except OSError:
+                # Best-effort cleanup; a leftover temp file must not fail the request.
+                pass
+
+
+# Single module-level STT wrapper shared by all requests; the underlying model
+# is only created on the first transcription.
+_whisper = _Whisper()
+
+
+class Handler(BaseHTTPRequestHandler):
+    """HTTP request handler for the AI bridge.
+
+    Routes:
+      POST /v1/chat/text   -> LLM reply for the supplied text
+      POST /v1/chat/audio  -> transcribe the supplied WAV, then reply to the transcript
+
+    Malformed client input is answered with 400, missing STT support with 503,
+    unknown paths with 404 and any other failure with 500.
+    """
+
+    server_version = "JiboAIBridge/1.0"
+
+    def do_POST(self):
+        """Log the request, parse its JSON body and dispatch on ``self.path``."""
+        client = f"{self.client_address[0]}:{self.client_address[1]}"
+        try:
+            declared_len = self.headers.get("Content-Length") or "0"
+            _log(f"{client} POST {self.path} len={declared_len}")
+
+            if self.path == "/v1/chat/text":
+                handle = self._handle_text
+            elif self.path == "/v1/chat/audio":
+                handle = self._handle_audio
+            else:
+                _json_response(self, 404, {"error": "Not found"})
+                return
+
+            try:
+                payload = _read_json(self)
+                if not isinstance(payload, dict):
+                    raise ValueError("JSON body must be an object")
+            except ValueError as e:
+                # Covers a non-numeric Content-Length, invalid UTF-8 and invalid
+                # JSON: all of these are client mistakes, not server faults.
+                _log(f"{client} {self.path} bad request body: {e}")
+                _json_response(self, 400, {"error": f"Invalid JSON body: {e}"})
+                return
+
+            handle(client, payload)
+
+        except Exception as e:
+            trace = traceback.format_exc()
+            _log(f"ERROR {self.path}: {e}\n{trace}")
+            try:
+                # NOTE(review): the traceback is sent to the client, presumably as a
+                # debugging aid; confirm before exposing this beyond a trusted LAN.
+                _json_response(self, 500, {"error": str(e), "trace": trace})
+            except OSError as send_error:
+                # The connection is most likely gone; nothing more can be sent.
+                _log(f"ERROR {self.path}: could not send 500 response: {send_error}")
+
+    def _handle_text(self, client: str, payload: dict):
+        """Answer /v1/chat/text: send ``payload["text"]`` to the LLM and return its reply."""
+        text = payload.get("text")
+        if not isinstance(text, str) or not text.strip():
+            _json_response(self, 400, {"error": "Missing 'text'"})
+            return
+        text = text.strip()
+        _log(f"{client} /text prompt_chars={len(text)} prompt={text[:200]!r}")
+        reply = _ollama_chat(text)
+        _log(f"{client} /text ok reply_chars={len(reply)}")
+        _json_response(self, 200, {"reply": reply})
+
+    def _handle_audio(self, client: str, payload: dict):
+        """Answer /v1/chat/audio: decode the WAV, transcribe it and reply to the transcript."""
+        b64 = payload.get("wav_base64")
+        if not b64:
+            _json_response(self, 400, {"error": "Missing 'wav_base64'"})
+            return
+
+        if not _whisper.available():
+            _log(f"{client} /audio STT unavailable (faster-whisper not installed)")
+            _json_response(
+                self,
+                503,
+                {
+                    "error": "STT unavailable: install faster-whisper and ffmpeg on this PC",
+                    "hint": "pip install faster-whisper  (and install ffmpeg)",
+                },
+            )
+            return
+
+        try:
+            wav_bytes = base64.b64decode(b64)
+        except (TypeError, ValueError) as e:
+            # binascii.Error is a ValueError subclass; TypeError covers a non-string value.
+            _log(f"{client} /audio invalid base64: {e}")
+            _json_response(self, 400, {"error": f"Invalid 'wav_base64': {e}"})
+            return
+
+        diag = _wav_diagnostics(wav_bytes)
+        wav_for_stt, conv = _to_loudest_channel_mono_wav(wav_bytes)
+        if conv.get("converted"):
+            diag_mono = _wav_diagnostics(wav_for_stt)
+            _log(
+                f"{client} /audio wav_diag={json.dumps(diag, sort_keys=True)} "
+                f"mono={json.dumps(diag_mono, sort_keys=True)} conv={json.dumps(conv, sort_keys=True)}"
+            )
+        else:
+            _log(f"{client} /audio wav_diag={json.dumps(diag, sort_keys=True)} conv={json.dumps(conv, sort_keys=True)}")
+        self._save_debug_wav(client, wav_bytes)
+
+        transcript = _whisper.transcribe_wav_bytes(wav_for_stt)
+        if not transcript:
+            _log(f"{client} /audio empty transcript")
+            _json_response(self, 200, {"reply": "I didn't catch that. Could you say it again?", "text": ""})
+            return
+        _log(f"{client} /audio transcript_chars={len(transcript)} transcript={transcript[:200]!r}")
+        reply = _ollama_chat(transcript)
+        _log(f"{client} /audio ok text_chars={len(transcript)} reply_chars={len(reply)}")
+        _json_response(self, 200, {"reply": reply, "text": transcript})
+
+    def _save_debug_wav(self, client: str, wav_bytes: bytes):
+        """Best-effort: write the received clip to ``AI_BRIDGE_LAST_WAV``, overwriting each time."""
+        try:
+            out_path = os.environ.get("AI_BRIDGE_LAST_WAV", "jibo_last.wav")
+            with open(out_path, "wb") as f:
+                f.write(wav_bytes)
+            _log(f"{client} /audio decoded bytes={len(wav_bytes)} saved={out_path}")
+        except Exception as e:
+            # Saving is only a debugging aid; a failure here must not fail the request.
+            _log(f"{client} /audio failed saving wav: {e}")
+
+    def log_message(self, format, *args):
+        # Keep BaseHTTPRequestHandler from double-logging; we do our own.
+        return
+
+
+def main(argv=None):
+    """Parse command-line options and serve requests until interrupted.
+
+    Args:
+        argv: optional list of command-line arguments; when None, argparse
+            falls back to ``sys.argv[1:]``.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--host", default="0.0.0.0")
+    ap.add_argument("--port", type=int, default=8020)
+    args = ap.parse_args(argv)
+
+    # The context manager closes the listening socket on every exit path.
+    with ThreadingHTTPServer((args.host, args.port), Handler) as server:
+        _log(f"AI Bridge server listening on http://{args.host}:{args.port}")
+        _log(
+            "Ollama: "
+            + os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
+            + " model="
+            + os.environ.get("OLLAMA_MODEL", "phi3.5")
+        )
+        if not _whisper.available():
+            _log("STT: faster-whisper not installed; /v1/chat/audio will return 503")
+        try:
+            server.serve_forever()
+        except KeyboardInterrupt:
+            # Ctrl+C is the normal way to stop the server; exit without a traceback.
+            _log("AI Bridge server stopped")
+
+
+# Start the server only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()