umm ai bridge upgrades

2026-04-01 20:32:01 +03:00
parent f6e5f2db36
commit aaf621460d
5 changed files with 474 additions and 28 deletions
--- a/V3.1/build/ai_bridge_server/server.py
+++ b/V3.1/build/ai_bridge_server/server.py
@@ -1,15 +1,23 @@
 #!/usr/bin/env python3
-"""Minimal AI Bridge server for Jibo.
+"""Streaming AI Bridge server for Jibo.
+
+Replicates the original Jibo server pattern: responds sentence-by-sentence
+so the robot can start speaking immediately while the LLM is still generating.

 Endpoints:
- POST /v1/chat/text  {"text": "..."} -> {"reply": "..."}
- POST /v1/chat/audio {"wav_base64": "...", "sample_rate": 16000} -> {"reply": "...", "text": "<transcript>"}
+- POST /v1/chat/text  {"text": "..."} -> chunked NDJSON, one line per sentence:
+      {"sentence": "First sentence.", "done": false}
+      {"sentence": "Second sentence!", "done": false}
+      {"sentence": "", "done": true, "reply": "First sentence. Second sentence!"}
+  Legacy (non-streaming) clients still work: the final line has the full "reply".
+
+- POST /v1/chat/audio {"wav_base64": "...", "sample_rate": 16000} -> {"reply": "...", "text": "<transcript>"} (non-streaming)

 LLM:
- Uses Ollama Chat API by default: http://localhost:11434/api/chat
+- Uses Ollama Chat API with streaming: http://localhost:11434/api/chat
  Env:
    OLLAMA_URL   (default: http://127.0.0.1:11434/api/chat)
-        OLLAMA_MODEL (default: phi3.5)
+    OLLAMA_MODEL (default: smollm2:135m)

 STT (optional, for /audio):
 - If `faster-whisper` is installed, it will be used.
@@ -63,8 +71,9 @@ def _read_json(handler: BaseHTTPRequestHandler) -> dict:


 def _ollama_chat(user_text: str) -> str:
+    """Non-streaming fallback (used by /audio endpoint)."""
    ollama_url = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
-    model = os.environ.get("OLLAMA_MODEL", "phi3.5")
+    model = os.environ.get("OLLAMA_MODEL", "smollm2:135m")

    req_body = {
        "model": model,
@@ -86,7 +95,6 @@ def _ollama_chat(user_text: str) -> str:
        with urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except Exception as e:
-        # Let caller decide how to respond; include a useful hint in logs.
        _log(f"Ollama request failed url={ollama_url!r} err={e!r}")
        raise

@@ -97,6 +105,95 @@ def _ollama_chat(user_text: str) -> str:
    return content.strip()


+import re
+
+# Sentence boundary: split on whitespace that follows .!? and precedes an
+# uppercase letter or quote. "3.5" stays intact; "Dr. Smith" is still split.
+_SENTENCE_END_RE = re.compile(
+    r'(?<=[.!?])\s+(?=[A-Z"\'])|(?<=[.!?])$'
+)
+
+
+def _split_sentences(text: str) -> list[str]:
+    """Split text into sentences. Returns list of sentence strings."""
+    parts = _SENTENCE_END_RE.split(text.strip())
+    return [p.strip() for p in parts if p.strip()]
+
+
+def _ollama_chat_streaming(user_text: str, on_sentence):
+    """Stream Ollama response token-by-token, calling on_sentence(sentence_str)
+    each time a complete sentence is detected. Returns the full reply string.
+
+    This is the key to sub-5-second perceived latency: the robot starts speaking
+    the first sentence while the LLM is still generating the rest.
+    """
+    ollama_url = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
+    model = os.environ.get("OLLAMA_MODEL", "smollm2:135m")
+
+    req_body = {
+        "model": model,
+        "stream": True,
+        "messages": [
+            {"role": "system", "content": "You are Jibo, a friendly home robot. Keep replies short and spoken."},
+            {"role": "user", "content": user_text},
+        ],
+    }
+
+    req = Request(
+        ollama_url,
+        data=json.dumps(req_body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+
+    full_reply = ""
+    buffer = ""
+    sentences_sent = 0
+    t0 = time.monotonic()
+
+    with urlopen(req, timeout=120) as resp:
+        # Ollama streams one JSON object per line
+        for raw_line in resp:
+            line = raw_line.decode("utf-8", errors="replace").strip()
+            if not line:
+                continue
+            try:
+                chunk = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+
+            # Each chunk: {"message": {"content": "token"}, "done": false}
+            msg = chunk.get("message") or {}
+            token = msg.get("content") or ""
+            if token:
+                buffer += token
+                full_reply += token
+
+            # Flush only when the buffer splits into more than one sentence, i.e.
+            # once the first character of the following sentence has arrived.
+            sentences = _split_sentences(buffer)
+            if len(sentences) > 1:
+                # All but the last are complete sentences; last is still building
+                for s in sentences[:-1]:
+                    elapsed = time.monotonic() - t0
+                    _log(f"  sentence #{sentences_sent} at {elapsed:.2f}s: {s[:120]!r}")
+                    on_sentence(s)
+                    sentences_sent += 1
+                buffer = sentences[-1]
+
+            if chunk.get("done"):
+                break
+
+    # Flush any remaining text as the final sentence
+    leftover = buffer.strip()
+    if leftover:
+        elapsed = time.monotonic() - t0
+        _log(f"  sentence #{sentences_sent} (final) at {elapsed:.2f}s: {leftover[:120]!r}")
+        on_sentence(leftover)
+
+    return full_reply.strip()
+
+
 def _short_err(e: BaseException) -> str:
    s = str(e) or e.__class__.__name__
    s = " ".join(s.split())
@@ -265,15 +362,55 @@ class Handler(BaseHTTPRequestHandler):
                    return
                _log(f"{client} /text prompt_chars={len(text)} prompt={text[:200]!r}")
                try:
-                    reply = _ollama_chat(text)
-                    _log(f"{client} /text ok reply_chars={len(reply)}")
-                    _json_response(self, 200, {"reply": reply})
-                except URLError as e:
+                    # Stream response: chunked transfer encoding with NDJSON
+                    # Each line is a JSON object the robot can parse immediately.
+                    # This replicates the original Jibo hub pattern where the robot
+                    # starts acting on partial results while the server is still working.
+                    self.send_response(200)
+                    self.send_header("Content-Type", "application/x-ndjson")
+                    self.send_header("Transfer-Encoding", "chunked")
+                    self.end_headers()
+
+                    t0 = time.monotonic()
+
+                    def _send_chunk(obj: dict):
+                        line = json.dumps(obj) + "\n"
+                        data = line.encode("utf-8")
+                        # HTTP chunked encoding: hex size + CRLF + data + CRLF
+                        self.wfile.write(f"{len(data):x}\r\n".encode())
+                        self.wfile.write(data)
+                        self.wfile.write(b"\r\n")
+                        self.wfile.flush()
+
+                    def _on_sentence(sentence: str):
+                        _send_chunk({"sentence": sentence, "done": False})
+
+                    reply = _ollama_chat_streaming(text, _on_sentence)
+                    elapsed = time.monotonic() - t0
+                    _log(f"{client} /text ok reply_chars={len(reply)} elapsed={elapsed:.2f}s")
+
+                    # Final chunk with full reply (for logging / legacy compat)
+                    _send_chunk({"sentence": "", "done": True, "reply": reply})
+                    # Terminate chunked encoding
+                    self.wfile.write(b"0\r\n\r\n")
+                    self.wfile.flush()
+
+                except (URLError, ConnectionRefusedError) as e:
                    _log(f"{client} /text ollama_unreachable err={_short_err(e)!r}")
-                    _json_response(self, 200, {"reply": _ollama_down_reply(), "ollama_ok": False, "ollama_error": _short_err(e)})
-                except ConnectionRefusedError as e:
-                    _log(f"{client} /text ollama_refused err={_short_err(e)!r}")
-                    _json_response(self, 200, {"reply": _ollama_down_reply(), "ollama_ok": False, "ollama_error": _short_err(e)})
+                    # Headers were already sent above, so report the failure as a
+                    # final NDJSON chunk and then terminate the chunked body.
+                    try:
+                        err_reply = _ollama_down_reply()
+                        err_obj = {"sentence": err_reply, "done": True, "reply": err_reply,
+                                   "ollama_ok": False, "ollama_error": _short_err(e)}
+                        line = json.dumps(err_obj) + "\n"
+                        data = line.encode("utf-8")
+                        self.wfile.write(f"{len(data):x}\r\n".encode())
+                        self.wfile.write(data)
+                        self.wfile.write(b"\r\n0\r\n\r\n")
+                        self.wfile.flush()
+                    except Exception:
+                        pass  # best effort: the client may have disconnected
                return

            if self.path == "/v1/chat/audio":
@@ -371,7 +508,7 @@ def main():
        "Ollama: "
        + os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
        + " model="
-        + os.environ.get("OLLAMA_MODEL", "phi3.5")
+        + os.environ.get("OLLAMA_MODEL", "smollm2:135m")
    )
    _log("Ollama health check: curl -s http://127.0.0.1:11434/api/tags | head")
    if not _whisper.available():