From 4103c08eac1c988fc7fb981f37a1aab5ba5add16 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Sun, 7 Jun 2026 13:15:08 -0400 Subject: [PATCH] fix(tests): remove conftest watchdog; rely on runner-level subprocess timeout The conftest watchdog (e1c8730f) was a misguided fix. Empirically observed 2026-06-07: 1. CUTS OFF BATCHES MID-TEST: On Windows, daemon=True threads are NOT auto-killed by the interpreter. The watchdog's time.sleep(30) continues through pytest's normal shutdown, then os._exit(0) fires. For any batch with live_gui tests (which start a sloppy.py subprocess and may take >30s), pytest gets killed mid-test before its FAILURES/summary line is printed. The user's last run showed every batch at exactly 32.0s, confirming the watchdog fires regardless of pytest state. 2. HIDES TEST FAILURES: pytest's os._exit(0) masks its actual exit code, so the run_tests_batched.py runner (using subprocess.run(check=True)) reported 'All 5 batches passed' even when batch 5 had 5 F's in test_ticket_queue and 1 F in test_live_gui_filedialog_regression. 3. TIMING CORRELATION: Every batch in the run completed in 32.0s exactly. The 30s watchdog + ~2s pytest startup = 32.0s for ALL batches, including ones with 240 items collected that pytest never finished running. Removed: - The watchdog thread registration (conftest.py lines 77-82) - The HANG PROTECTION comment block (replaced with explanation of why we removed it) - tests/test_conftest_watchdog.py (the test no longer applies) Kept: - The wait_for_warmup() call (this is the SPEC's mechanism for tests to wait for AppController warmup, NOT a watchdog) The runner's subprocess.run(timeout=1000) per batch is now the only safety net. --- tests/conftest.py | 52 +++++++---------- tests/test_conftest_watchdog.py | 98 --------------------------------- 2 files changed, 19 insertions(+), 131 deletions(-) delete mode 100644 tests/test_conftest_watchdog.py diff --git a/tests/conftest.py b/tests/conftest.py index 8a5efe8d..9b519cb3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,33 +34,26 @@ install() # the live_gui fixture also creates one), this call is a no-op or # fast (warmup already done). # -# HANG PROTECTION: The run_tests_batched.py runner hangs at the end -# of a batch when the pytest subprocess fails to exit cleanly. Two -# hang chains have been observed: -# 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) joining a -# blocked worker (concurrent.futures._python_exit, pool __del__, -# etc.). An earlier atexit fix at commit 8957c9a5 attempted to -# preempt this; verified empirically that atexit handlers do NOT -# fire at all when a pool worker is blocked in user code, so the -# fix is ineffective (see src/io_pool.py module docstring). -# 2. The session-scoped `live_gui` fixture teardown (conftest.py:~451) -# hangs in client.reset_session() (HTTP call to the hook server) -# or kill_process_tree(process.pid) / process.wait(timeout=2) -# (waiting for the sloppy.py subprocess to die on Windows). -# Both chains keep the pytest subprocess alive indefinitely, which -# makes run_tests_batched.py hang at subprocess.run() waiting for the -# child to exit. +# HANG PROTECTION (REMOVED): An earlier commit (e1c8730f) added a +# daemon-thread watchdog that unconditionally called os._exit(0) after +# 30s. The intent was to bound hangs from ThreadPoolExecutor.__del__ +# and the live_gui fixture teardown. Empirically (2026-06-07), this +# watchdog was harmful: +# - On Windows, daemon=True threads are NOT auto-killed by the +# interpreter. The watchdog's time.sleep(30) continues through +# pytest's normal shutdown, then os._exit(0) fires. +# - For batches that take >30s (e.g., live_gui tests), pytest gets +# killed mid-test before printing its FAILURES/summary line. +# - The os._exit(0) hides pytest's actual exit code, so the +# run_tests_batched.py runner reports 'Batch N passed' even when +# tests had failed (e.g., 5 F's in test_ticket_queue). # -# Solution: a daemon-thread watchdog that unconditionally calls -# os._exit(0) after a generous timeout. If pytest exits cleanly -# first, the thread is killed when the process tears down -# (daemon=True). If pytest hangs, the watchdog kicks in and the -# batched runner can move to the next batch. 30s timeout: batches -# 1-3 in the user's run completed in 1-5s of test execution; 30s -# leaves headroom for slow batches while bounding the worst-case -# hang at half a minute. See src/app_controller.py:_install_sigint_exit_handler -# for the same pattern (SIGINT + os._exit(0)) applied to the -# production Ctrl+C path. +# The proper hang-bounding is now at the RUNNER level: +# scripts/run_tests_batched.py uses subprocess.run(timeout=1000) per +# batch. If pytest hangs, the runner kills it after 1000s and reports +# failure. Successful batches run to completion (pytest prints +# FAILURES + summary + exits with 1 for the runner to catch via +# CalledProcessError). import atexit from src.app_controller import AppController _warmup_app_controller = AppController() @@ -74,13 +67,6 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0): stacklevel=2, ) -def _watchdog_exit() -> None: - import time - time.sleep(30.0) - os._exit(0) -import threading -threading.Thread(target=_watchdog_exit, daemon=True, name="conftest-hang-watchdog").start() - from src.gui_2 import App class VerificationLogger: diff --git a/tests/test_conftest_watchdog.py b/tests/test_conftest_watchdog.py deleted file mode 100644 index ca31cf0a..00000000 --- a/tests/test_conftest_watchdog.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Regression: pytest conftest must install a hang-bounding watchdog. - -The run_tests_batched.py runner hangs at the end of a batch when the -pytest subprocess fails to exit cleanly. Two hang chains have been -observed: - 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) on a blocked - worker during interpreter finalization. - 2. The session-scoped `live_gui` fixture teardown (conftest.py:~451) - hanging on HTTP call to the hook server or on process.wait() for - the sloppy.py subprocess. - -The conftest installs a daemon-thread watchdog (os._exit(0) after a -30s timeout) to bound the hang. The exit code is 0 (success) on -purpose: this is a sledgehammer to force-exit any stuck pytest -process, NOT a signal to the runner. Failure detection is the -runner's job — run_tests_batched.py uses subprocess.run(timeout=120) -and treats TimeoutExpired as a batch failure. - -This test verifies the watchdog is actually registered after the -conftest loads. It does NOT spawn a subprocess (which would itself -be bound by the watchdog and create a recursive timeout), it just -inspects threading.enumerate() at the time the test runs. - -If the watchdog is removed or the timeout grows, this test fails -and the run_tests_batched.py hang returns. -""" - -import sys -import threading -from pathlib import Path - -import pytest - -ROOT = Path(__file__).resolve().parent.parent -sys.path.insert(0, str(ROOT)) - -# The conftest has already been loaded by pytest before this test -# collection. We just need to verify the watchdog thread is alive. -WATCHDOG_NAME = "conftest-hang-watchdog" -WATCHDOG_SLEEP_SECONDS = 30.0 -WATCHDOG_TOLERANCE_SECONDS = 5.0 - - -def test_watchdog_thread_registered() -> None: - """Verify the conftest's hang-bounding watchdog thread is alive. - - The watchdog is a daemon thread named "conftest-hang-watchdog" that - sleeps for ~30s then calls os._exit(0). It must be alive (not yet - fired) at the time this test runs, because the pytest session has - not been running for 30s yet. - """ - threads = threading.enumerate() - names = [t.name for t in threads] - assert WATCHDOG_NAME in names, ( - f"conftest watchdog thread {WATCHDOG_NAME!r} not found in " - f"threading.enumerate(); run_tests_batched.py will hang at end " - f"of batch. Active threads: {names}" - ) - - -def test_watchdog_thread_is_daemon() -> None: - """Watchdog must be daemon so it doesn't block pytest's own exit.""" - for t in threading.enumerate(): - if t.name == WATCHDOG_NAME: - assert t.daemon, ( - f"watchdog thread is not daemon (daemon={t.daemon}); " - f"this would prevent pytest from exiting cleanly" - ) - return - pytest.fail(f"watchdog thread {WATCHDOG_NAME!r} not found") - - -def test_watchdog_timeout_within_tolerance() -> None: - """Watchdog timeout must be near the documented 30s value. - - If the timeout drifts too low (<25s), normal slow batches could - be killed prematurely. If it drifts too high (>120s), the hang - bounding is too loose. This test enforces the contract. - """ - import re - conftest_path = Path(__file__).resolve().parent / "conftest.py" - text = conftest_path.read_text(encoding="utf-8") - # Look for the watchdog sleep call and extract the timeout - match = re.search(r"time\.sleep\(([\d.]+)\)", text) - assert match is not None, ( - f"could not find time.sleep() call in {conftest_path}; " - f"watchdog may have been removed or restructured" - ) - sleep_value = float(match.group(1)) - assert ( - WATCHDOG_SLEEP_SECONDS - WATCHDOG_TOLERANCE_SECONDS - <= sleep_value - <= WATCHDOG_SLEEP_SECONDS + WATCHDOG_TOLERANCE_SECONDS - ), ( - f"watchdog timeout is {sleep_value}s; expected " - f"~{WATCHDOG_SLEEP_SECONDS}s +/- {WATCHDOG_TOLERANCE_SECONDS}s. " - f"If the timeout was intentionally changed, update this test." - )