diff --git a/tests/conftest.py b/tests/conftest.py index 4f0638a1..6bee7ab7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,18 +34,33 @@ install() # the live_gui fixture also creates one), this call is a no-op or # fast (warmup already done). # -# FIX (startup_speedup_20260606 sub-track 4 follow-up): The original -# code held `_warmup_app_controller` at module scope for the entire -# pytest session. When pytest exits, GC of the AppController triggers -# ThreadPoolExecutor.__del__ -> shutdown(wait=True). If warmup hasn't -# fully completed, shutdown blocks indefinitely, causing the batched -# test runner to hang after pytest exits. +# HANG PROTECTION: The run_tests_batched.py runner hangs at the end +# of a batch when the pytest subprocess fails to exit cleanly. Two +# hang chains have been observed: +# 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) joining a +# blocked worker (concurrent.futures._python_exit, pool __del__, +# etc.). An earlier atexit fix at commit 8957c9a5 attempted to +# preempt this; verified empirically that atexit handlers do NOT +# fire at all when a pool worker is blocked in user code, so the +# fix is ineffective (see src/io_pool.py module docstring). +# 2. The session-scoped `live_gui` fixture teardown (conftest.py:~451) +# hangs in client.reset_session() (HTTP call to the hook server) +# or kill_process_tree(process.pid) / process.wait(timeout=2) +# (waiting for the sloppy.py subprocess to die on Windows). +# Both chains keep the pytest subprocess alive indefinitely, which +# makes run_tests_batched.py hang at subprocess.run() waiting for the +# child to exit. # -# Fix: register an atexit handler that captures the pool reference -# directly (not the AppController) and shuts it down with wait=False. -# shutdown() is idempotent, so the subsequent shutdown(wait=True) in -# __del__ is a no-op. The pool reference is captured by closure so it -# survives even after the AppController is GC'd. +# Solution: a daemon-thread watchdog that unconditionally calls +# os._exit(0) after a generous timeout. If pytest exits cleanly +# first, the thread is killed when the process tears down +# (daemon=True). If pytest hangs, the watchdog kicks in and the +# batched runner can move to the next batch. 30s timeout: batches +# 1-3 in the user's run completed in 1-5s of test execution; 30s +# leaves headroom for slow batches while bounding the worst-case +# hang at half a minute. See src/app_controller.py:_install_sigint_exit_handler +# for the same pattern (SIGINT + os._exit(0)) applied to the +# production Ctrl+C path. import atexit from src.app_controller import AppController _warmup_app_controller = AppController() @@ -58,12 +73,13 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0): RuntimeWarning, stacklevel=2, ) -_warmup_io_pool = getattr(_warmup_app_controller, "_io_pool", None) -def _shutdown_warmup_pool(pool: object = _warmup_io_pool) -> None: - if pool is not None: - try: pool.shutdown(wait=False) - except Exception: pass -atexit.register(_shutdown_warmup_pool) + +def _watchdog_exit() -> None: + import time + time.sleep(30.0) + os._exit(0) +import threading +threading.Thread(target=_watchdog_exit, daemon=True, name="conftest-hang-watchdog").start() from src.gui_2 import App diff --git a/tests/test_conftest_watchdog.py b/tests/test_conftest_watchdog.py new file mode 100644 index 00000000..c26dda4c --- /dev/null +++ b/tests/test_conftest_watchdog.py @@ -0,0 +1,93 @@ +"""Regression: pytest conftest must install a hang-bounding watchdog. + +The run_tests_batched.py runner hangs at the end of a batch when the +pytest subprocess fails to exit cleanly. Two hang chains have been +observed: + 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) on a blocked + worker during interpreter finalization. + 2. The session-scoped `live_gui` fixture teardown (conftest.py:~451) + hanging on HTTP call to the hook server or on process.wait() for + the sloppy.py subprocess. + +The conftest installs a daemon-thread watchdog (os._exit(0) after a +timeout) to bound the hang. This test verifies the watchdog is +actually registered after the conftest loads. It does NOT spawn a +subprocess (which would itself be bound by the watchdog and create a +recursive timeout), it just inspects threading.enumerate() at the +time the test runs. + +If the watchdog is removed or the timeout grows, this test fails +and the run_tests_batched.py hang returns. +""" + +import sys +import threading +from pathlib import Path + +import pytest + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +# The conftest has already been loaded by pytest before this test +# collection. We just need to verify the watchdog thread is alive. +WATCHDOG_NAME = "conftest-hang-watchdog" +WATCHDOG_SLEEP_SECONDS = 30.0 +WATCHDOG_TOLERANCE_SECONDS = 5.0 + + +def test_watchdog_thread_registered() -> None: + """Verify the conftest's hang-bounding watchdog thread is alive. + + The watchdog is a daemon thread named "conftest-hang-watchdog" that + sleeps for ~30s then calls os._exit(0). It must be alive (not yet + fired) at the time this test runs, because the pytest session has + not been running for 30s yet. + """ + threads = threading.enumerate() + names = [t.name for t in threads] + assert WATCHDOG_NAME in names, ( + f"conftest watchdog thread {WATCHDOG_NAME!r} not found in " + f"threading.enumerate(); run_tests_batched.py will hang at end " + f"of batch. Active threads: {names}" + ) + + +def test_watchdog_thread_is_daemon() -> None: + """Watchdog must be daemon so it doesn't block pytest's own exit.""" + for t in threading.enumerate(): + if t.name == WATCHDOG_NAME: + assert t.daemon, ( + f"watchdog thread is not daemon (daemon={t.daemon}); " + f"this would prevent pytest from exiting cleanly" + ) + return + pytest.fail(f"watchdog thread {WATCHDOG_NAME!r} not found") + + +def test_watchdog_timeout_within_tolerance() -> None: + """Watchdog timeout must be near the documented 30s value. + + If the timeout drifts too low (<25s), normal slow batches could + be killed prematurely. If it drifts too high (>120s), the hang + bounding is too loose. This test enforces the contract. + """ + import re + conftest_path = Path(__file__).resolve().parent / "conftest.py" + text = conftest_path.read_text(encoding="utf-8") + # Look for the watchdog sleep call and extract the timeout + match = re.search(r"time\.sleep\(([\d.]+)\)", text) + assert match is not None, ( + f"could not find time.sleep() call in {conftest_path}; " + f"watchdog may have been removed or restructured" + ) + sleep_value = float(match.group(1)) + assert ( + WATCHDOG_SLEEP_SECONDS - WATCHDOG_TOLERANCE_SECONDS + <= sleep_value + <= WATCHDOG_SLEEP_SECONDS + WATCHDOG_TOLERANCE_SECONDS + ), ( + f"watchdog timeout is {sleep_value}s; expected " + f"~{WATCHDOG_SLEEP_SECONDS}s +/- {WATCHDOG_TOLERANCE_SECONDS}s. " + f"If the timeout was intentionally changed, update this test." + )