AI bridge upgrades
This commit is contained in:
@@ -1,15 +1,23 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Minimal AI Bridge server for Jibo.
|
||||
"""Streaming AI Bridge server for Jibo.
|
||||
|
||||
Replicates the original Jibo server pattern: responds sentence-by-sentence
|
||||
so the robot can start speaking immediately while the LLM is still generating.
|
||||
|
||||
Endpoints:
|
||||
- POST /v1/chat/text {"text": "..."} -> {"reply": "..."}
|
||||
- POST /v1/chat/audio {"wav_base64": "...", "sample_rate": 16000} -> {"reply": "...", "text": "<transcript>"}
|
||||
- POST /v1/chat/text {"text": "..."} -> chunked NDJSON, one line per sentence:
|
||||
{"sentence": "First sentence.", "done": false}
|
||||
{"sentence": "Second sentence!", "done": false}
|
||||
{"sentence": "", "done": true, "reply": "First sentence. Second sentence!"}
|
||||
Legacy (non-streaming) clients still work: the final line has the full "reply".
|
||||
|
||||
- POST /v1/chat/audio {"wav_base64": "...", "sample_rate": 16000} -> same streaming format
|
||||
|
||||
LLM:
|
||||
- Uses Ollama Chat API by default: http://localhost:11434/api/chat
|
||||
- Uses Ollama Chat API with streaming: http://localhost:11434/api/chat
|
||||
Env:
|
||||
OLLAMA_URL (default: http://127.0.0.1:11434/api/chat)
|
||||
OLLAMA_MODEL (default: phi3.5)
|
||||
OLLAMA_MODEL (default: smollm2:135m)
|
||||
|
||||
STT (optional, for /audio):
|
||||
- If `faster-whisper` is installed, it will be used.
|
||||
@@ -63,8 +71,9 @@ def _read_json(handler: BaseHTTPRequestHandler) -> dict:
|
||||
|
||||
|
||||
def _ollama_chat(user_text: str) -> str:
|
||||
"""Non-streaming fallback (used by /audio endpoint)."""
|
||||
ollama_url = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
|
||||
model = os.environ.get("OLLAMA_MODEL", "phi3.5")
|
||||
model = os.environ.get("OLLAMA_MODEL", "smollm2:135m")
|
||||
|
||||
req_body = {
|
||||
"model": model,
|
||||
@@ -86,7 +95,6 @@ def _ollama_chat(user_text: str) -> str:
|
||||
with urlopen(req, timeout=60) as resp:
|
||||
data = json.loads(resp.read().decode("utf-8"))
|
||||
except Exception as e:
|
||||
# Let caller decide how to respond; include a useful hint in logs.
|
||||
_log(f"Ollama request failed url={ollama_url!r} err={e!r}")
|
||||
raise
|
||||
|
||||
@@ -97,6 +105,95 @@ def _ollama_chat(user_text: str) -> str:
|
||||
return content.strip()
|
||||
|
||||
|
||||
import re
|
||||
|
||||
# Sentence boundary: split on .!? followed by space or end, but not on
|
||||
# abbreviations like "Dr." or "Mr." or decimals like "3.5".
|
||||
_SENTENCE_END_RE = re.compile(
|
||||
r'(?<=[.!?])\s+(?=[A-Z"\'])|(?<=[.!?])$'
|
||||
)
|
||||
|
||||
|
||||
def _split_sentences(text: str) -> list[str]:
|
||||
"""Split text into sentences. Returns list of sentence strings."""
|
||||
parts = _SENTENCE_END_RE.split(text.strip())
|
||||
return [p.strip() for p in parts if p.strip()]
|
||||
|
||||
|
||||
def _ollama_chat_streaming(user_text: str, on_sentence):
    """Stream a chat reply from Ollama and emit it sentence by sentence.

    Posts *user_text* (plus a fixed system prompt) to the URL in ``OLLAMA_URL``
    with ``"stream": True`` and reads the response line by line, treating each
    non-empty line as one JSON object.  Token text from
    ``chunk["message"]["content"]`` is accumulated in a buffer; whenever
    ``_split_sentences`` finds more than one sentence in the buffer, every
    sentence except the last (which may still be growing) is passed to
    *on_sentence*.  Whatever remains when the stream ends is emitted as the
    final sentence.

    Emitting early lets the caller start acting on the first sentence while
    the LLM is still generating the rest.

    Args:
        user_text: The user's message, sent as the ``user`` role content.
        on_sentence: Callable invoked with one sentence string at a time, in
            order.

    Returns:
        The concatenation of all received tokens, stripped.

    Raises:
        Whatever ``urlopen`` or reading the response raises; nothing is
        caught here except JSON decode errors on individual lines, which are
        skipped.
    """
    ollama_url = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
    model = os.environ.get("OLLAMA_MODEL", "smollm2:135m")

    req_body = {
        "model": model,
        "stream": True,
        "messages": [
            {"role": "system", "content": "You are Jibo, a friendly home robot. Keep replies short and spoken."},
            {"role": "user", "content": user_text},
        ],
    }

    req = Request(
        ollama_url,
        data=json.dumps(req_body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    reply_parts = []  # every token received, joined once at the end
    buffer = ""  # text not yet emitted through on_sentence
    sentences_sent = 0
    t0 = time.monotonic()

    with urlopen(req, timeout=120) as resp:
        # Ollama streams one JSON object per line
        for raw_line in resp:
            line = raw_line.decode("utf-8", errors="replace").strip()
            if not line:
                continue
            try:
                chunk = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(chunk, dict):
                # Valid JSON but not an object: nothing to read from it.
                continue

            # Each chunk: {"message": {"content": "token"}, "done": false}
            msg = chunk.get("message") or {}
            token = msg.get("content") or ""
            if token:
                buffer += token
                reply_parts.append(token)

                sentences = _split_sentences(buffer)
                if len(sentences) > 1:
                    # All but the last are complete sentences; last is still building
                    for s in sentences[:-1]:
                        elapsed = time.monotonic() - t0
                        _log(f"  sentence #{sentences_sent} at {elapsed:.2f}s: {s[:120]!r}")
                        on_sentence(s)
                        sentences_sent += 1
                    # _split_sentences strips every piece.  Re-attach the
                    # whitespace the buffer ended with, otherwise a following
                    # token without a leading space would be glued onto the
                    # previous word.
                    trailing_ws = buffer[len(buffer.rstrip()):]
                    buffer = sentences[-1] + trailing_ws

            if chunk.get("done"):
                break

    # Flush any remaining text as the final sentence
    leftover = buffer.strip()
    if leftover:
        elapsed = time.monotonic() - t0
        _log(f"  sentence #{sentences_sent} (final) at {elapsed:.2f}s: {leftover[:120]!r}")
        on_sentence(leftover)

    return "".join(reply_parts).strip()
|
||||
|
||||
|
||||
def _short_err(e: BaseException) -> str:
|
||||
s = str(e) or e.__class__.__name__
|
||||
s = " ".join(s.split())
|
||||
@@ -265,15 +362,55 @@ class Handler(BaseHTTPRequestHandler):
|
||||
return
|
||||
_log(f"{client} /text prompt_chars={len(text)} prompt={text[:200]!r}")
|
||||
try:
|
||||
reply = _ollama_chat(text)
|
||||
_log(f"{client} /text ok reply_chars={len(reply)}")
|
||||
_json_response(self, 200, {"reply": reply})
|
||||
except URLError as e:
|
||||
# Stream response: chunked transfer encoding with NDJSON
|
||||
# Each line is a JSON object the robot can parse immediately.
|
||||
# This replicates the original Jibo hub pattern where the robot
|
||||
# starts acting on partial results while the server is still working.
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/x-ndjson")
|
||||
self.send_header("Transfer-Encoding", "chunked")
|
||||
self.end_headers()
|
||||
|
||||
t0 = time.monotonic()
|
||||
|
||||
def _send_chunk(obj: dict):
|
||||
line = json.dumps(obj) + "\n"
|
||||
data = line.encode("utf-8")
|
||||
# HTTP chunked encoding: hex size + CRLF + data + CRLF
|
||||
self.wfile.write(f"{len(data):x}\r\n".encode())
|
||||
self.wfile.write(data)
|
||||
self.wfile.write(b"\r\n")
|
||||
self.wfile.flush()
|
||||
|
||||
def _on_sentence(sentence: str):
|
||||
_send_chunk({"sentence": sentence, "done": False})
|
||||
|
||||
reply = _ollama_chat_streaming(text, _on_sentence)
|
||||
elapsed = time.monotonic() - t0
|
||||
_log(f"{client} /text ok reply_chars={len(reply)} elapsed={elapsed:.2f}s")
|
||||
|
||||
# Final chunk with full reply (for logging / legacy compat)
|
||||
_send_chunk({"sentence": "", "done": True, "reply": reply})
|
||||
# Terminate chunked encoding
|
||||
self.wfile.write(b"0\r\n\r\n")
|
||||
self.wfile.flush()
|
||||
|
||||
except (URLError, ConnectionRefusedError) as e:
|
||||
_log(f"{client} /text ollama_unreachable err={_short_err(e)!r}")
|
||||
_json_response(self, 200, {"reply": _ollama_down_reply(), "ollama_ok": False, "ollama_error": _short_err(e)})
|
||||
except ConnectionRefusedError as e:
|
||||
_log(f"{client} /text ollama_refused err={_short_err(e)!r}")
|
||||
_json_response(self, 200, {"reply": _ollama_down_reply(), "ollama_ok": False, "ollama_error": _short_err(e)})
|
||||
# Fallback: try to send a non-streaming error response.
|
||||
# If headers already sent, write it as a chunk.
|
||||
try:
|
||||
err_reply = _ollama_down_reply()
|
||||
err_obj = {"sentence": err_reply, "done": True, "reply": err_reply,
|
||||
"ollama_ok": False, "ollama_error": _short_err(e)}
|
||||
line = json.dumps(err_obj) + "\n"
|
||||
data = line.encode("utf-8")
|
||||
self.wfile.write(f"{len(data):x}\r\n".encode())
|
||||
self.wfile.write(data)
|
||||
self.wfile.write(b"\r\n0\r\n\r\n")
|
||||
self.wfile.flush()
|
||||
except Exception:
|
||||
pass # headers may not have been sent yet
|
||||
return
|
||||
|
||||
if self.path == "/v1/chat/audio":
|
||||
@@ -371,7 +508,7 @@ def main():
|
||||
"Ollama: "
|
||||
+ os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434/api/chat")
|
||||
+ " model="
|
||||
+ os.environ.get("OLLAMA_MODEL", "phi3.5")
|
||||
+ os.environ.get("OLLAMA_MODEL", "smollm2:135m")
|
||||
)
|
||||
_log("Ollama health check: curl -s http://127.0.0.1:11434/api/tags | head")
|
||||
if not _whisper.available():
|
||||
|
||||
Reference in New Issue
Block a user