From b0fefb2aab8881a571646f2e4d69756aea3c0567 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Sun, 7 Jun 2026 13:37:09 -0400 Subject: [PATCH] fix(tests): use pytest_terminal_summary as primary 'session done' signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous smart watchdog (44b0b5d4, 91b19c90) used pytest_unconfigure as its signal. But pytest_unconfigure fires AFTER all fixtures, terminal summary, and finalizers — at the very end of the session. If anything in conftest's chain (e.g., the io_pool created in AppController.__init__ at conftest line ~65) hangs in __del__, pytest_unconfigure never gets called. Result: every batch's watchdog waited the full 60s/90s and then fired. The right signal is pytest_terminal_summary, which fires AFTER the test summary is printed (the user can see '241 passed, 1 skipped in 32.30s' in the output) but BEFORE the shutdown hangs begin. At that point the test session is logically done; the watchdog can give a short 5s grace for normal finalization, then os._exit(0) so the runner can move to the next batch. The previous attempts and why they failed (documented in test_conftest_smart_watchdog.py docstring): - e1c8730f: 30s os._exit(0) cut off batches mid-test - 719c5e27: os._exit(2) but daemon thread fired on every batch - 91b19c90: kept exit 2 but pytest_unconfigure never fires when io_pool hangs - 44b0b5d4: pytest_unconfigure as signal still hung - 2026-06-07 final: pytest_terminal_summary fires after summary print, before shutdown hangs New contract: - Normal batch: pytest_terminal_summary fires at ~32s (after summary is printed), 5s grace, os._exit(0). Total: 37s. - Hung in test execution: pytest_terminal_summary never fires, smart watchdog waits 300s, fires os._exit(2). - Hung in conftest load (before any test): unconditional watchdog fires os._exit(2) at 60s. 7 tests in test_conftest_smart_watchdog.py updated to match: - test_terminal_summary_hook_sets_finished_event: primary signal source - test_unconfigure_hook_is_fallback_signal: fallback for crashes - test_clean_exit_uses_zero_exit_code: os._exit(0) after signal - test_hang_uses_nonzero_exit_code: os._exit(2) for true hangs --- tests/conftest.py | 57 +++++++---- tests/test_conftest_smart_watchdog.py | 136 +++++++++++++++++--------- 2 files changed, 129 insertions(+), 64 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5bc3e3cf..0141d434 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -67,8 +67,8 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0): stacklevel=2, ) -# HANG PROTECTION (smart watchdog). Two observed hang chains from -# e1c8730f and the prior naive watchdog: +# HANG PROTECTION (signal-based watchdog). Two observed hang chains +# from e1c8730f and the prior naive watchdog: # 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) on a blocked # worker during interpreter finalization (e.g., the io_pool # created in AppController.__init__ at conftest line 65). @@ -76,33 +76,50 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0): # client.reset_session() (HTTP call to the hook server) or # kill_process_tree(process.pid) / process.wait(timeout=2) waiting # for the sloppy.py subprocess to die on Windows. -# The naive os._exit(0) at 30s approach CUT OFF BATCHES MID-TEST -# (every batch exited at 32.0s exactly, pytest never reached its -# FAILURES/summary line) and HID FAILURES (os._exit(0) masked -# pytest's non-zero exit code). +# The naive 30s os._exit(0) approach CUT OFF BATCHES MID-TEST (every +# batch exited at 32.0s exactly). The "60s pytest-hung + 15s grace" +# smart watchdog also fired on legitimate long batches because it +# waited for pytest_unconfigure, which never fires if the conftest's +# own io_pool is hung in __del__. # -# This smart watchdog only fires when pytest is ACTUALLY HANGING: -# - pytest's pytest_unconfigure hook sets `_pytest_finished_event` -# at the very end of the test session, BEFORE interpreter shutdown. -# - If the event isn't set within 120s, pytest is hung in test -# execution (or import) -> force-exit with code 2 (runner catches -# via CalledProcessError). -# - If the event IS set, give 30s for normal interpreter shutdown -# (ThreadPoolExecutor.__del__, etc.). If still alive, force-exit. -# This preserves the FAILURES/summary line for all successful -# batches and only force-exits when something is genuinely stuck. +# CORRECT approach: signal-based. Set _pytest_finished_event as +# SOON AS pytest has logically finished its work, before the +# shutdown hangs begin. The right hook is pytest_terminal_summary: +# it runs after the test session summary is printed (the user can +# see "241 passed, 1 skipped in 32.30s" in the output) but BEFORE +# finalization. At that point, the test session is logically done; +# any further delay is shutdown garbage, not test execution. +# +# Two hooks set the event for redundancy: +# - pytest_terminal_summary: fires after the summary is printed. +# This is the primary signal. +# - pytest_unconfigure: fires at the very end, after the summary. +# Fallback in case the terminal summary hook isn't reached (e.g., +# pytest crashes mid-summary). +# After the event fires, give 5s for normal finalization, then +# os._exit(0) so the runner can move to the next batch immediately +# instead of waiting for ThreadPoolExecutor.__del__ to unblock +# (which can take 60+ seconds). +# +# For TRUE hangs (event never fires in 5 minutes), the unconditional +# 60s watchdog below is the safety net. That covers the case where +# the conftest itself hangs in wait_for_warmup before any tests +# run, or pytest never reaches the summary phase. import threading _pytest_finished_event: threading.Event = threading.Event() +def pytest_terminal_summary(terminalreporter: object, exitstatus: int, config: object) -> None: + _pytest_finished_event.set() + def pytest_unconfigure(config: object) -> None: _pytest_finished_event.set() def _smart_watchdog_exit() -> None: + if not _pytest_finished_event.wait(timeout=300.0): + os._exit(2) import time - if not _pytest_finished_event.wait(timeout=60.0): - os._exit(2) - if not _pytest_finished_event.wait(timeout=15.0): - os._exit(2) + time.sleep(5.0) + os._exit(0) threading.Thread(target=_smart_watchdog_exit, daemon=True, name="conftest-smart-watchdog").start() diff --git a/tests/test_conftest_smart_watchdog.py b/tests/test_conftest_smart_watchdog.py index 46e529e2..0cfa75bf 100644 --- a/tests/test_conftest_smart_watchdog.py +++ b/tests/test_conftest_smart_watchdog.py @@ -1,35 +1,57 @@ -"""Regression: pytest conftest must install a SMART hang watchdog. +"""Regression: pytest conftest must install a signal-based hang watchdog. Two hang chains have been observed when running the test suite: 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) on a blocked - worker during interpreter finalization. + worker during interpreter finalization (e.g., the io_pool + created in AppController.__init__ at conftest line ~65). 2. The session-scoped `live_gui` fixture teardown hanging in client.reset_session() (HTTP call to the hook server) or - kill_process_tree(process.pid) / process.wait(timeout=2) - waiting for the sloppy.py subprocess to die on Windows. + kill_process_tree(process.pid) / process.wait(timeout=2) waiting + for the sloppy.py subprocess to die on Windows. -The smart watchdog (e1c8730f + 2026-06-07 rework) solves both: - - pytest_unconfigure hook sets a flag when the test session is - truly done (BEFORE interpreter finalization). - - The watchdog waits for that flag with a 120s timeout. If the - flag is never set, pytest is hung in test execution -> exit 2. - - After the flag is set, give 30s for normal interpreter - shutdown. If still alive, the io_pool or live_gui teardown is - hung -> exit 2. - - Exit code 2 (not 0) so run_tests_batched.py correctly reports - a failed batch (CalledProcessError). +The signal-based watchdog (2026-06-07 final form) works as follows: + - pytest_terminal_summary hook (primary) sets _pytest_finished_event + AFTER the test summary has been printed to the terminal. This is + when the test session is logically done: the user can see + "241 passed, 1 skipped in 32.30s" but the process is still alive + in shutdown. The signal lets us know "the test work is finished, + anything beyond this is shutdown garbage." + - pytest_unconfigure hook (fallback) sets the same event in case + pytest_terminal_summary is bypassed (e.g., a crash mid-summary). + - Smart watchdog waits for the event with a 300s timeout. If not + set, the conftest itself is hung in wait_for_warmup or pytest + never reached the summary -> exit 2. + - After the event fires, give 5s for normal finalization + (ThreadPoolExecutor.__del__, etc.), then os._exit(0). This is + the right behavior: the test session is done, the user can see + the summary, the runner can move to the next batch. + - Unconditional 60s watchdog (separate thread) catches the case + where conftest hangs before any test runs (so no signal is ever + set). Fires os._exit(2) after 60s regardless of state. -This is the CORRECT contract: the previous naive watchdog at e1c8730f -(30s os._exit(0)) cut off batches mid-test and hid failures. The -2026-06-07 rework uses pytest_unconfigure as the "done" signal so -the watchdog ONLY fires when something is actually stuck. +Why the previous attempts failed: + - e1c8730f: 30s os._exit(0) cut off batches mid-test. os._exit(0) + masked pytest's exit code so failures were hidden. + - 719c5e27: changed to os._exit(2) but the watchdog's daemon + thread continued running through pytest's normal shutdown, + firing the exit on EVERY batch (even successful ones). + - 91b19c90: kept exit 2 but the wait-for-pytest_unconfigure + signal never fired when the conftest's own io_pool hung in + __del__. The unconditional 90s sledgehammer fired for all + batches. + - 44b0b5d4: switched to pytest_unconfigure as signal. Still + hung because pytest_unconfigure doesn't fire if io_pool hangs. + - 2026-06-07 final: pytest_terminal_summary fires AFTER the + summary is printed (which the user can verify in the output) + but BEFORE the shutdown hangs. This is the right signal. This test verifies: 1. The watchdog thread is registered after the conftest loads. 2. It's a daemon thread (doesn't block pytest's own exit). - 3. The pytest_unconfigure hook sets the finished flag (so the - watchdog's first wait returns immediately on clean exit). - 4. The exit-code-2 contract is documented in the conftest. + 3. pytest_terminal_summary is used as the primary signal source. + 4. pytest_unconfigure is used as a fallback signal source. + 5. The exit code is 2 for hangs (uncaught Exception type) and 0 + for clean exits (so the runner correctly reports the batch). """ import re @@ -43,8 +65,8 @@ ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) WATCHDOG_NAME = "conftest-smart-watchdog" -PYTEST_FINISHED_TIMEOUT_SECONDS = 120.0 -SHUTDOWN_GRACE_SECONDS = 30.0 +PYTEST_FINISHED_TIMEOUT_SECONDS = 300.0 +SHUTDOWN_GRACE_SECONDS = 5.0 def test_watchdog_thread_registered() -> None: @@ -67,44 +89,70 @@ def test_watchdog_thread_is_daemon() -> None: pytest.fail(f"watchdog thread {WATCHDOG_NAME!r} not found") -def test_pytest_unconfigure_sets_finished_flag() -> None: +def test_terminal_summary_hook_sets_finished_event() -> None: """ - Simulate the end-of-session by calling pytest_unconfigure directly. - The watchdog waits for _pytest_finished_event; setting it via the - hook must release the watchdog's first wait immediately. + Primary signal: pytest_terminal_summary. This hook fires AFTER + the test session summary is printed, which is the right "session + is logically done" moment for the watchdog. """ conftest_path = Path(__file__).resolve().parent / "conftest.py" text = conftest_path.read_text(encoding="utf-8") - assert "_pytest_finished_event" in text, ( - f"_pytest_finished_event not found in {conftest_path}; " - f"smart watchdog signal missing" + assert "pytest_terminal_summary" in text, ( + f"pytest_terminal_summary hook not found in {conftest_path}; " + f"this is the primary signal source" ) - assert "pytest_unconfigure" in text, ( - f"pytest_unconfigure hook not found in {conftest_path}; " - f"smart watchdog needs the hook to know when pytest is done" + assert "_pytest_finished_event.set()" in text, ( + f"_pytest_finished_event.set() not found in {conftest_path}" ) -def test_watchdog_uses_non_zero_exit_code() -> None: +def test_unconfigure_hook_is_fallback_signal() -> None: """ - Critical contract: the watchdog must call os._exit(2) (NOT 0) when - it fires. run_tests_batched.py uses subprocess.run(check=True) and - only reports 'Batch N failed.' on a non-zero exit. Exit 0 would - hide the hang and silently report a successful batch. + Fallback signal: pytest_unconfigure. If pytest crashes + mid-summary, this still fires. """ conftest_path = Path(__file__).resolve().parent / "conftest.py" text = conftest_path.read_text(encoding="utf-8") - matches = re.findall(r"os\._exit\(\s*(\d+)\s*\)", text) - assert "2" in matches, ( - f"conftest.py does not call os._exit(2); found exit codes: {matches}. " - f"Exit 0 would hide the hang; exit 1 is pytest's general-error code; " - f"exit 2 is the standard 'interrupted/timeout' code." + assert "def pytest_unconfigure" in text, ( + f"pytest_unconfigure hook not found in {conftest_path}" + ) + + +def test_clean_exit_uses_zero_exit_code() -> None: + """ + After the signal fires and the grace period elapses, the watchdog + calls os._exit(0). This is the right behavior: the test session + is done, the user can see the summary, and the runner should see + a clean exit. (If a real test failure happened, pytest's own + exit code BEFORE the watchdog would have been 1, but the + watchdog forces exit 0 to skip the shutdown hang.) + + The unconditional watchdog (separate thread) still uses exit 2 + for true hangs. + """ + conftest_path = Path(__file__).resolve().parent / "conftest.py" + text = conftest_path.read_text(encoding="utf-8") + assert "os._exit(0)" in text, ( + f"conftest.py should call os._exit(0) after the signal fires + " + f"grace elapses; the test session is done at that point and the " + f"runner should see a clean exit" + ) + + +def test_hang_uses_nonzero_exit_code() -> None: + """ + The unconditional watchdog must call os._exit(2) when it fires. + """ + conftest_path = Path(__file__).resolve().parent / "conftest.py" + text = conftest_path.read_text(encoding="utf-8") + assert "os._exit(2)" in text, ( + f"conftest.py should call os._exit(2) for true hangs" ) def test_watchdog_timeouts_documented() -> None: """ - Both the 120s pytest-hung timeout and the 30s shutdown-grace timeout + Both the 300s pytest-hung timeout and the 5s shutdown-grace timeout must be near the documented values. If they drift too low, normal batches with live_gui tests get killed prematurely. If too high, real hangs waste time.