diff --git a/scripts/run_tests_batched.py b/scripts/run_tests_batched.py
index 9c7f27d9..c676b179 100644
--- a/scripts/run_tests_batched.py
+++ b/scripts/run_tests_batched.py
@@ -1,36 +1,108 @@
+"""Run the test suite in alphabetical batches of 32 files.
+
+Behavior:
+  - Per-batch subprocess timeout of 180s. Exceeding the timeout counts
+    as a batch failure (the watchdog in tests/conftest.py bounds the
+    actual pytest hang at 30s, but the outer timeout is the
+    runner-level safety net).
+  - Per-batch elapsed time reported in the header line.
+  - pytest's own exit-code based failure detection (subprocess
+    CalledProcessError) is preserved for batches that finish but
+    contain test failures.
+  - Final summary lists all files in any failed batch (per file, not
+    per batch, so the user can re-run individual files).
+
+Usage:
+  uv run python scripts/run_tests_batched.py
+  uv run python scripts/run_tests_batched.py --batch-size 16
+  uv run python scripts/run_tests_batched.py --timeout 300
+"""
+
+from __future__ import annotations
+
+import argparse
 import os
 import subprocess
 import sys
+import time
 
-def run_tests():
- test_dir = "tests"
- test_files = [f for f in os.listdir(test_dir) if f.startswith("test_") and f.endswith(".py")]
- test_files.sort()
- 
- batch_size = 32
- all_failed = []
- 
- print(f"Starting test execution of {len(test_files)} files in batches of {batch_size}...")
- 
- for i in range(0, len(test_files), batch_size):
-  batch = test_files[i:i + batch_size]
-  cmd = ["uv", "run", "pytest", "--maxfail=10"] + [os.path.join(test_dir, f) for f in batch]
-  print(f"\nBatch {i//batch_size + 1}: {' '.join(batch)}")
+
+def run_tests(batch_size: int, timeout: int) -> int:
+ """Run tests/test_*.py through pytest in sorted batches of batch_size.
+
+ A batch fails if pytest exits non-zero or exceeds timeout seconds.
+ Returns 0 if all batches pass, 1 if any fails, 2 if nothing to run.
+ """
+ test_dir: str = "tests"
+ if not os.path.isdir(test_dir):
+  print(f"ERROR: '{test_dir}' directory not found", file=sys.stderr)
+  return 2
+ test_files: list[str] = sorted(
+  f for f in os.listdir(test_dir)
+  if f.startswith("test_") and f.endswith(".py")
+ )
+ if not test_files:
+  print(f"ERROR: no test files found in '{test_dir}'", file=sys.stderr)
+  return 2
+ batches: list[list[str]] = [
+  test_files[i : i + batch_size] for i in range(0, len(test_files), batch_size)
+ ]
+ failed_files: list[str] = []
+ failed_batches: int = 0
+ batch_timings: list[float] = []
+
+ print(f"Starting test execution of {len(test_files)} files in {len(batches)} batches of {batch_size} (timeout {timeout}s per batch)...")
+ print()
+ for batch_idx, batch in enumerate(batches, start=1):
+  cmd: list[str] = ["uv", "run", "pytest", "--maxfail=10"]
+  cmd += [os.path.join(test_dir, f) for f in batch]
+  print(f"Batch {batch_idx}/{len(batches)} ({len(batch)} files):")
+  batch_failed: bool = True  # cleared only on the success path below
+  start: float = time.perf_counter()
   try:
-   subprocess.run(cmd, check=True)
-  except subprocess.CalledProcessError:
-   print(f"Batch {i//batch_size + 1} failed.")
-   all_failed.extend(batch)
-   
- if all_failed:
-  print("\n" + "="*30)
-  print(f"Total batches with failures: {len(all_failed)//batch_size + 1 if len(all_failed)%batch_size else len(all_failed)//batch_size}")
-  print("Files in failed batches:")
-  for f in all_failed:
+   subprocess.run(cmd, check=True, timeout=timeout)
+  except subprocess.TimeoutExpired:
+   verdict, note = "TIMED OUT after", f" (limit {timeout}s)"
+  except subprocess.CalledProcessError as e:
+   verdict, note = "FAILED after", f" (pytest exit {e.returncode})"
+  else:
+   verdict, note, batch_failed = "passed in", "", False
+  elapsed: float = time.perf_counter() - start
+  batch_timings.append(elapsed)
+  print(f"  >>> Batch {batch_idx} {verdict} {elapsed:.1f}s{note}")
+  if batch_failed:
+   failed_batches += 1
+   failed_files.extend(batch)
+  print()
+
+ print("=" * 70)
+ if failed_files:
+  print(f"Total batches: {len(batches)}; failed: {failed_batches}")
+  print(f"Failed files: {len(failed_files)}")
+  print("=" * 70)
+  for f in failed_files:
    print(f" - {f}")
-  print("="*30)
- else:
-  print("\nAll batches passed successfully!")
+  return 1
+ total_time: float = sum(batch_timings)
+ avg: float = total_time / max(len(batch_timings), 1)
+ print(f"All {len(batches)} batches passed in {total_time:.1f}s (avg {avg:.1f}s per batch)")
+ print("=" * 70)
+ return 0
+
+
+def main() -> None:
+ """Parse and validate CLI options, then exit with run_tests()'s status code."""
+ summary: str = (__doc__ or "").split("\n\n")[0]  # __doc__ is None under -OO
+ ap: argparse.ArgumentParser = argparse.ArgumentParser(description=summary)
+ ap.add_argument("--batch-size", type=int, default=32, help="tests per batch (default: 32)")
+ ap.add_argument("--timeout", type=int, default=180, help="seconds per batch (default: 180)")
+ args: argparse.Namespace = ap.parse_args()
+ for flag, value in (("--batch-size", args.batch_size), ("--timeout", args.timeout)):
+  if value <= 0:
+   print(f"ERROR: {flag} must be positive", file=sys.stderr)
+   sys.exit(2)
+ sys.exit(run_tests(args.batch_size, args.timeout))
+
 
 if __name__ == "__main__":
- run_tests()
+ main()
diff --git a/tests/conftest.py b/tests/conftest.py
index 71ea2e62..8a5efe8d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -77,7 +77,7 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0):
 def _watchdog_exit() -> None:
  import time
  time.sleep(30.0)
- os._exit(2)
+ os._exit(2)  # must stay non-zero: the batch runner treats exit 0 as a passed batch
 import threading
 threading.Thread(target=_watchdog_exit, daemon=True, name="conftest-hang-watchdog").start()
 
diff --git a/tests/test_conftest_watchdog.py b/tests/test_conftest_watchdog.py
index 548b55be..ca31cf0a 100644
--- a/tests/test_conftest_watchdog.py
+++ b/tests/test_conftest_watchdog.py
@@ -9,14 +9,12 @@ observed:
      hanging on HTTP call to the hook server or on process.wait() for
      the sloppy.py subprocess.
 
-The conftest installs a daemon-thread watchdog (os._exit(2) after a
-30s timeout) to bound the hang. The non-zero exit code is critical:
-run_tests_batched.py uses subprocess.run(check=True) and only
-prints "Batch N failed." if pytest exits non-zero. Exit code 0 would
-silently report a successful batch even when the watchdog killed
-pytest mid-test (the FAILURES section never gets printed). Exit
-code 2 is the standard "interrupted by signal/timeout" code that
-preserves the failure signal to the runner.
+The conftest installs a daemon-thread watchdog (os._exit() after a
+30s timeout) as a sledgehammer to force-exit a stuck pytest process.
+Failure detection is the runner's job: run_tests_batched.py runs each
+batch with subprocess.run(check=True, timeout=180 by default), so a
+batch fails only on a non-zero exit or TimeoutExpired. The watchdog
+fires at 30s, so the runner sees its exit code, not a timeout.
 
 This test verifies the watchdog is actually registered after the
 conftest loads. It does NOT spawn a subprocess (which would itself