Private
Public Access
0
0

feat(warmup): per-module canary records (thread + timing observability)

Adds a canary record for each module submitted to the warmup, tracking:
canary_id, module, thread_name, thread_id, submit_ts, start_ts,
end_ts, elapsed_ms, status, error.

Surface:
- WarmupManager.canaries() returns list[dict] (defensive copy)
- AppController.warmup_canaries() returns list[dict] (delegation)
- GET /api/warmup_canaries Hook API endpoint
- ApiHookClient.get_warmup_canaries() returns list[dict]

Example: the warmup of google.genai records a 1187ms canary on
thread controller-io_0 with thread_id 50420, canary_id 1.

11 new tests (8 unit in test_warmup_canaries + 3 in test_api_hooks_warmup).
All pass; live_gui smoke test confirms endpoint returns real data.
This commit is contained in:
2026-06-06 22:02:35 -04:00
parent f09cd4a733
commit 208aa664db
6 changed files with 322 additions and 4 deletions
+11
View File
@@ -318,6 +318,17 @@ class ApiHookClient:
"""
return self._make_request('GET', f'/api/warmup_wait?timeout={timeout}') or {}
def get_warmup_canaries(self) -> list[dict[str, Any]]:
"""
Returns per-module import canary records: list of dicts with
canary_id, module, thread_name, thread_id, submit_ts, start_ts,
end_ts, elapsed_ms, status, error. Used for debugging which
worker thread loaded which module and how long it took.
[C: tests/test_api_hooks_warmup.py:test_get_warmup_canaries_in_live_gui]
"""
result = self._make_request('GET', '/api/warmup_canaries') or {}
return result.get("canaries", []) if isinstance(result, dict) else []
#endregion: Diagnostics
#region: Project
+18
View File
@@ -364,6 +364,24 @@ class HookHandler(BaseHTTPRequestHandler):
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(payload).encode("utf-8"))
elif self.path == "/api/warmup_canaries" or self.path.startswith("/api/warmup_canaries?"):
# Per-module import canary records (startup_speedup_20260606 sub-track 4+).
# Each record carries canary_id, module, thread_name, thread_id,
# submit_ts, start_ts, end_ts, elapsed_ms, status, error.
# Cheap (lock-guarded copy on the WarmupManager). Direct call,
# no GUI trampoline (the WarmupManager is already thread-safe).
controller = _get_app_attr(app, "controller", None)
if controller and hasattr(controller, "warmup_canaries"):
try:
payload = {"canaries": controller.warmup_canaries()}
except Exception:
payload = {"canaries": []}
else:
payload = {"canaries": []}
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(payload).encode("utf-8"))
else:
self.send_response(404)
self.end_headers()
+6
View File
@@ -2120,6 +2120,12 @@ class AppController:
"""
return self._warmup.status()
def warmup_canaries(self) -> list[dict]:
"""
Per-module import canary records. Each record carries: canary_id, module, thread_name, thread_id, submit_ts, start_ts, end_ts, elapsed_ms, status, error. Useful for debugging which worker thread loaded which module and how long it took. Returns a defensive copy (caller mutation is safe). [SDM: src/app_controller.py:warmup_canaries].
"""
return self._warmup.canaries()
def is_warmup_done(self) -> bool:
"""
+82 -4
View File
@@ -12,11 +12,25 @@ Public API on the manager (and exposed on AppController via delegation):
mgr.is_done() - bool
mgr.wait(timeout) - block until done
mgr.on_complete(callback) - register completion callback
mgr.canaries() - list[dict] of per-module canary records (observability)
mgr.reset() - clear state (for re-warmup, e.g. in tests)
Canary records (one per submitted module) carry:
canary_id: monotonic numeric ID (continues across resets)
module: module name
thread_name: name of the worker thread that did the import (e.g. "controller-io-0")
thread_id: threading.get_ident() of that worker
submit_ts: wall-clock when submit() was called
start_ts: wall-clock when the worker started the import
end_ts: wall-clock when the import finished
elapsed_ms: (end_ts - start_ts) * 1000
status: "running" | "completed" | "failed" | "cancelled"
error: error message string if status == "failed", else None
"""
import importlib
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable, Optional
@@ -34,8 +48,12 @@ class WarmupManager:
self._failed: list[str] = []
self._callbacks: list[CompletionCallback] = []
self._started = False
# Canary observability state (per-module import tracking).
self._canaries: list[dict] = []
self._next_canary_id: int = 1
def submit(self, modules: list[str]) -> None:
submit_ts = time.time()
with self._lock:
if self._started:
raise RuntimeError("WarmupManager.submit() called twice; call reset() first")
@@ -44,6 +62,21 @@ class WarmupManager:
self._failed = []
self._done_event.clear()
self._started = True
for name in modules:
canary = {
"canary_id": self._next_canary_id,
"module": name,
"thread_name": None,
"thread_id": None,
"submit_ts": submit_ts,
"start_ts": None,
"end_ts": None,
"elapsed_ms": None,
"status": "running",
"error": None,
}
self._next_canary_id += 1
self._canaries.append(canary)
for name in modules:
self._pool.submit(self._warmup_one, name)
@@ -55,6 +88,11 @@ class WarmupManager:
"failed": list(self._failed),
}
def canaries(self) -> list[dict]:
"""Return a defensive copy of the canary records (per-module import tracking)."""
with self._lock:
return [dict(c) for c in self._canaries]
def is_done(self) -> bool:
return self._done_event.is_set()
@@ -83,21 +121,52 @@ class WarmupManager:
self._done_event.clear()
self._callbacks = []
self._started = False
# Canary records are preserved across resets (full history).
# Any still-running canaries from the prior submit are marked
# "cancelled" so callers can distinguish.
for c in self._canaries:
if c["status"] == "running":
c["status"] = "cancelled"
c["end_ts"] = c.get("end_ts") or time.time()
if c.get("start_ts") and c["elapsed_ms"] is None:
c["elapsed_ms"] = (c["end_ts"] - c["start_ts"]) * 1000
def _warmup_one(self, name: str) -> None:
start_ts = time.time()
thread = threading.current_thread()
thread_name = thread.name
thread_id = thread.ident
# Mark start in the canary record (find by module name; running record exists).
with self._lock:
for c in self._canaries:
if c["module"] == name and c["status"] == "running" and c["start_ts"] is None:
c["thread_name"] = thread_name
c["thread_id"] = thread_id
c["start_ts"] = start_ts
break
try:
importlib.import_module(name)
except BaseException as e:
self._record_failure(name, e)
end_ts = time.time()
self._record_failure(name, e, end_ts)
else:
self._record_success(name)
end_ts = time.time()
self._record_success(name, end_ts)
def _record_success(self, name: str) -> None:
def _record_success(self, name: str, end_ts: Optional[float] = None) -> None:
if end_ts is None: end_ts = time.time()
callbacks: list[CompletionCallback] = []
with self._lock:
if name in self._pending:
self._pending.remove(name)
self._completed.append(name)
for c in self._canaries:
if c["module"] == name and c["status"] == "running":
c["status"] = "completed"
c["end_ts"] = end_ts
if c["start_ts"] is not None:
c["elapsed_ms"] = (end_ts - c["start_ts"]) * 1000
break
done = self._started and not self._pending
if done:
self._done_event.set()
@@ -108,12 +177,21 @@ class WarmupManager:
except Exception:
pass
def _record_failure(self, name: str, _err: BaseException) -> None:
def _record_failure(self, name: str, _err: BaseException, end_ts: Optional[float] = None) -> None:
if end_ts is None: end_ts = time.time()
callbacks: list[CompletionCallback] = []
with self._lock:
if name in self._pending:
self._pending.remove(name)
self._failed.append(name)
for c in self._canaries:
if c["module"] == name and c["status"] == "running":
c["status"] = "failed"
c["end_ts"] = end_ts
c["error"] = f"{type(_err).__name__}: {_err}"
if c["start_ts"] is not None:
c["elapsed_ms"] = (end_ts - c["start_ts"]) * 1000
break
done = self._started and not self._pending
if done:
self._done_event.set()