Private
Public Access
0
0

fix(tests): revert watchdog to os._exit(0); runner uses subprocess timeout

The os._exit(2) change in 719c5e27 introduced a regression: the watchdog's daemon thread continues running through pytest's interpreter shutdown. On EVERY batch (even ones that complete successfully in 17s), the watchdog's time.sleep(30.0) elapses during finalization and the thread calls os._exit(2) just as pytest is wrapping up. Result: every batch was reported as 'Batch N failed' by run_tests_batched.py, even ones with '126 passed in 17.14s'.

Revert watchdog to os._exit(0) — its original purpose (force-exit any stuck pytest at 30s) doesn't need a non-zero code; it's a sledgehammer, not a signal. The runner does its own failure detection.

Update scripts/run_tests_batched.py to:
  - Use subprocess.run(timeout=180) per batch
  - Catch TimeoutExpired as a batch failure (with elapsed time + reason printed)
  - Catch CalledProcessError as a batch failure (preserved from before)
  - Print elapsed time for every batch (pass or fail) so hang behavior is visible
  - Print a final summary that lists all FAILED FILES (not batches) for easy re-running
  - Add --batch-size and --timeout CLI flags
  - Add 1-space indentation + type hints per project style

Verified: ast.parse OK; --help works; test_conftest_watchdog 3/3 pass.
This commit is contained in:
2026-06-07 12:59:27 -04:00
parent 719c5e274a
commit 955b61df78
3 changed files with 107 additions and 37 deletions
+100 -28
View File
@@ -1,36 +1,108 @@
"""Run the test suite in alphabetical batches of 32 files.
Behavior:
- Per-batch subprocess timeout (180s by default, configurable with
--timeout). Exceeding the timeout counts as a batch failure (the
watchdog in tests/conftest.py bounds the actual pytest hang at 30s,
but the outer timeout is the runner-level safety net).
- Per-batch elapsed time reported in the header line.
- pytest's own exit-code based failure detection (subprocess
CalledProcessError) is preserved for batches that finish but
contain test failures.
- Final summary lists all files in any failed batch (per file, not
per batch, so the user can re-run individual files).
Usage:
uv run python scripts/run_tests_batched.py
uv run python scripts/run_tests_batched.py --batch-size 16
uv run python scripts/run_tests_batched.py --timeout 300
"""
from __future__ import annotations
import argparse
import os
import subprocess
import sys
import time
def run_tests(
 batch_size: int,
 timeout: int,
 *,
 test_dir: str = "tests",
 base_cmd: tuple[str, ...] = ("uv", "run", "pytest", "--maxfail=10"),
) -> int:
 """Run every ``test_*.py`` file in *test_dir* in sorted batches.

 Each batch is one subprocess: *base_cmd* followed by the batch's file
 paths. A batch fails when the subprocess exits non-zero or runs longer
 than *timeout* seconds. Elapsed time is printed for every batch, and the
 final summary lists the files of all failed batches.

 Args:
  batch_size: Number of test files per batch; must be positive.
  timeout: Per-batch limit in seconds, passed to ``subprocess.run``.
  test_dir: Directory scanned (non-recursively) for ``test_*.py`` files.
  base_cmd: Command prefix that the batch's file paths are appended to.

 Returns:
  0 when every batch passed, 1 when at least one batch failed, and 2
  when nothing could be run (missing directory, no test files, or the
  command could not be launched).

 Raises:
  ValueError: If *batch_size* is not positive.
 """
 if batch_size <= 0:
  # A negative step would produce zero batches and report a false success.
  raise ValueError("batch_size must be positive")
 if not os.path.isdir(test_dir):
  print(f"ERROR: '{test_dir}' directory not found", file=sys.stderr)
  return 2
 test_files: list[str] = sorted(
  f for f in os.listdir(test_dir)
  if f.startswith("test_") and f.endswith(".py")
 )
 if not test_files:
  print(f"ERROR: no test files found in '{test_dir}'", file=sys.stderr)
  return 2
 batches: list[list[str]] = [
  test_files[i : i + batch_size] for i in range(0, len(test_files), batch_size)
 ]
 failed_files: list[str] = []
 failed_batches: int = 0
 total_time: float = 0.0
 print(f"Starting test execution of {len(test_files)} files in {len(batches)} batches of {batch_size} (timeout {timeout}s per batch)...")
 print()
 for batch_idx, batch in enumerate(batches, start=1):
  cmd: list[str] = [*base_cmd, *(os.path.join(test_dir, f) for f in batch)]
  print(f"Batch {batch_idx}/{len(batches)} ({len(batch)} files):")
  failure: subprocess.SubprocessError | None = None
  start: float = time.perf_counter()
  try:
   subprocess.run(cmd, check=True, timeout=timeout)
  except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as exc:
   failure = exc
  except OSError as exc:
   # The launcher itself is missing or not executable; no batch can run.
   print(f"ERROR: could not launch {cmd[0]!r}: {exc}", file=sys.stderr)
   return 2
  elapsed: float = time.perf_counter() - start
  total_time += elapsed
  if failure is None:
   print(f" >>> Batch {batch_idx} passed in {elapsed:.1f}s")
  else:
   if isinstance(failure, subprocess.TimeoutExpired):
    print(f" >>> Batch {batch_idx} TIMED OUT after {elapsed:.1f}s (limit {timeout}s)")
   else:
    print(f" >>> Batch {batch_idx} FAILED after {elapsed:.1f}s (pytest exit {failure.returncode})")
   failed_batches += 1
   # Record files rather than batch numbers so each can be re-run alone.
   failed_files.extend(batch)
  print()
 print("=" * 70)
 if failed_files:
  print(f"Total batches: {len(batches)}; failed: {failed_batches}")
  print(f"Failed files: {len(failed_files)}")
  print("=" * 70)
  for f in failed_files:
   print(f" - {f}")
  return 1
 avg: float = total_time / len(batches)
 print(f"All {len(batches)} batches passed in {total_time:.1f}s (avg {avg:.1f}s per batch)")
 print("=" * 70)
 return 0
def main() -> None:
 """Parse the command line and exit with the status of ``run_tests``.

 Exits with status 2 when ``--batch-size`` or ``--timeout`` is not
 positive; otherwise exits with the value ``run_tests`` returns.
 """
 # __doc__ is None when docstrings are stripped (python -OO); fall back to
 # no description instead of raising AttributeError on None.split().
 summary: str = (__doc__ or "").split("\n\n")[0]
 ap: argparse.ArgumentParser = argparse.ArgumentParser(description=summary or None)
 ap.add_argument("--batch-size", type=int, default=32, help="tests per batch (default: 32)")
 ap.add_argument("--timeout", type=int, default=180, help="seconds per batch (default: 180)")
 args: argparse.Namespace = ap.parse_args()
 if args.batch_size <= 0:
  print("ERROR: --batch-size must be positive", file=sys.stderr)
  sys.exit(2)
 if args.timeout <= 0:
  print("ERROR: --timeout must be positive", file=sys.stderr)
  sys.exit(2)
 sys.exit(run_tests(args.batch_size, args.timeout))
# Script entry point: main() parses the CLI flags and supplies the
# batch_size/timeout arguments that run_tests() requires.
if __name__ == "__main__":
 main()
+1 -1
View File
@@ -77,7 +77,7 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0):
def _watchdog_exit() -> None:
 """Force-terminate the process 30 seconds after this function starts.

 Ends the process through ``os._exit``, so nothing after the sleep runs
 in any thread.
 """
 import time
 time.sleep(30.0)
 # Status 0 on purpose: the watchdog only bounds a hang and is not a
 # failure signal. Exiting non-zero here made sessions that were still
 # finalizing look like failed runs.
 # NOTE(review): a process killed here reports success, so hang detection
 # has to come from the caller (e.g. an outer subprocess timeout).
 os._exit(0)
import threading
threading.Thread(target=_watchdog_exit, daemon=True, name="conftest-hang-watchdog").start()
+6 -8
View File
@@ -9,14 +9,12 @@ observed:
hanging on HTTP call to the hook server or on process.wait() for
the sloppy.py subprocess.
The conftest installs a daemon-thread watchdog (os._exit(2) after a
30s timeout) to bound the hang. The non-zero exit code is critical:
run_tests_batched.py uses subprocess.run(check=True) and only
prints "Batch N failed." if pytest exits non-zero. Exit code 0 would
silently report a successful batch even when the watchdog killed
pytest mid-test (the FAILURES section never gets printed). Exit
code 2 is the standard "interrupted by signal/timeout" code that
preserves the failure signal to the runner.
The conftest installs a daemon-thread watchdog (os._exit(0) after a
30s timeout) to bound the hang. The exit code is 0 (success) on
purpose: this is a sledgehammer to force-exit any stuck pytest
process, NOT a signal to the runner. Failure detection is the
runner's job — run_tests_batched.py uses subprocess.run with a
per-batch timeout (180s by default) and treats TimeoutExpired as a
batch failure.
This test verifies the watchdog is actually registered after the
conftest loads. It does NOT spawn a subprocess (which would itself