fix(tests): bound run_tests_batched.py hang at 30s via daemon watchdog
run_tests_batched.py hangs at the end of a batch when the pytest
subprocess fails to exit cleanly. Two hang chains have been observed:
1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) joining a
blocked worker during interpreter finalization
(concurrent.futures._python_exit, pool __del__, etc.).
2. The session-scoped \live_gui\ fixture teardown hanging in
client.reset_session() (HTTP call to hook server) or
kill_process_tree(process.pid) / process.wait(timeout=2)
(waiting for the sloppy.py subprocess to die on Windows).
A previous atexit-based fix (commit 8957c9a5) attempted to preempt
chain #1, but verified empirically that atexit handlers do NOT fire
at all when a pool worker is blocked in user code (see
src/io_pool.py module docstring for the full analysis). The
atexit-based fix is therefore ineffective, and was removed from
the conftest in this commit.
Solution: a daemon-thread watchdog that unconditionally calls
os._exit(0) after 30s. If pytest exits cleanly first, the thread
is killed when the process tears down (daemon=True). If pytest
hangs, the watchdog kicks in and the batched runner can move to
the next batch. Same pattern as
src/app_controller.py:_install_sigint_exit_handler (the production
Ctrl+C fix); the difference is the trigger (time-based vs. SIGINT).
Files:
- tests/conftest.py: replaced the ineffective atexit-based fix
with the daemon-thread watchdog. Header comment documents both
hang chains and explains why atexit was abandoned.
- tests/test_conftest_watchdog.py: 3 static regression tests that
verify the watchdog is registered as a daemon thread with a
timeout in the 25-35s range. Static checks (not subprocess) so
the test itself isn't recursively bound by the watchdog.
This commit is contained in:
+33
-17
@@ -34,18 +34,33 @@ install()
|
|||||||
# the live_gui fixture also creates one), this call is a no-op or
|
# the live_gui fixture also creates one), this call is a no-op or
|
||||||
# fast (warmup already done).
|
# fast (warmup already done).
|
||||||
#
|
#
|
||||||
# FIX (startup_speedup_20260606 sub-track 4 follow-up): The original
|
# HANG PROTECTION: The run_tests_batched.py runner hangs at the end
|
||||||
# code held `_warmup_app_controller` at module scope for the entire
|
# of a batch when the pytest subprocess fails to exit cleanly. Two
|
||||||
# pytest session. When pytest exits, GC of the AppController triggers
|
# hang chains have been observed:
|
||||||
# ThreadPoolExecutor.__del__ -> shutdown(wait=True). If warmup hasn't
|
# 1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) joining a
|
||||||
# fully completed, shutdown blocks indefinitely, causing the batched
|
# blocked worker (concurrent.futures._python_exit, pool __del__,
|
||||||
# test runner to hang after pytest exits.
|
# etc.). An earlier atexit fix at commit 8957c9a5 attempted to
|
||||||
|
# preempt this; verified empirically that atexit handlers do NOT
|
||||||
|
# fire at all when a pool worker is blocked in user code, so the
|
||||||
|
# fix is ineffective (see src/io_pool.py module docstring).
|
||||||
|
# 2. The session-scoped `live_gui` fixture teardown (conftest.py:~451)
|
||||||
|
# hangs in client.reset_session() (HTTP call to the hook server)
|
||||||
|
# or kill_process_tree(process.pid) / process.wait(timeout=2)
|
||||||
|
# (waiting for the sloppy.py subprocess to die on Windows).
|
||||||
|
# Both chains keep the pytest subprocess alive indefinitely, which
|
||||||
|
# makes run_tests_batched.py hang at subprocess.run() waiting for the
|
||||||
|
# child to exit.
|
||||||
#
|
#
|
||||||
# Fix: register an atexit handler that captures the pool reference
|
# Solution: a daemon-thread watchdog that unconditionally calls
|
||||||
# directly (not the AppController) and shuts it down with wait=False.
|
# os._exit(0) after a generous timeout. If pytest exits cleanly
|
||||||
# shutdown() is idempotent, so the subsequent shutdown(wait=True) in
|
# first, the thread is killed when the process tears down
|
||||||
# __del__ is a no-op. The pool reference is captured by closure so it
|
# (daemon=True). If pytest hangs, the watchdog kicks in and the
|
||||||
# survives even after the AppController is GC'd.
|
# batched runner can move to the next batch. 30s timeout: batches
|
||||||
|
# 1-3 in the user's run completed in 1-5s of test execution; 30s
|
||||||
|
# leaves headroom for slow batches while bounding the worst-case
|
||||||
|
# hang at half a minute. See src/app_controller.py:_install_sigint_exit_handler
|
||||||
|
# for the same pattern (SIGINT + os._exit(0)) applied to the
|
||||||
|
# production Ctrl+C path.
|
||||||
import atexit
|
import atexit
|
||||||
from src.app_controller import AppController
|
from src.app_controller import AppController
|
||||||
_warmup_app_controller = AppController()
|
_warmup_app_controller = AppController()
|
||||||
@@ -58,12 +73,13 @@ if not _warmup_app_controller.wait_for_warmup(timeout=60.0):
|
|||||||
RuntimeWarning,
|
RuntimeWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
_warmup_io_pool = getattr(_warmup_app_controller, "_io_pool", None)
|
|
||||||
def _shutdown_warmup_pool(pool: object = _warmup_io_pool) -> None:
|
def _watchdog_exit() -> None:
|
||||||
if pool is not None:
|
import time
|
||||||
try: pool.shutdown(wait=False)
|
time.sleep(30.0)
|
||||||
except Exception: pass
|
os._exit(0)
|
||||||
atexit.register(_shutdown_warmup_pool)
|
import threading
|
||||||
|
threading.Thread(target=_watchdog_exit, daemon=True, name="conftest-hang-watchdog").start()
|
||||||
|
|
||||||
from src.gui_2 import App
|
from src.gui_2 import App
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,93 @@
|
|||||||
|
"""Regression: pytest conftest must install a hang-bounding watchdog.
|
||||||
|
|
||||||
|
The run_tests_batched.py runner hangs at the end of a batch when the
|
||||||
|
pytest subprocess fails to exit cleanly. Two hang chains have been
|
||||||
|
observed:
|
||||||
|
1. ThreadPoolExecutor.__del__ -> shutdown(wait=True) on a blocked
|
||||||
|
worker during interpreter finalization.
|
||||||
|
2. The session-scoped `live_gui` fixture teardown (conftest.py:~451)
|
||||||
|
hanging on HTTP call to the hook server or on process.wait() for
|
||||||
|
the sloppy.py subprocess.
|
||||||
|
|
||||||
|
The conftest installs a daemon-thread watchdog (os._exit(0) after a
|
||||||
|
timeout) to bound the hang. This test verifies the watchdog is
|
||||||
|
actually registered after the conftest loads. It does NOT spawn a
|
||||||
|
subprocess (which would itself be bound by the watchdog and create a
|
||||||
|
recursive timeout), it just inspects threading.enumerate() at the
|
||||||
|
time the test runs.
|
||||||
|
|
||||||
|
If the watchdog is removed or the timeout grows, this test fails
|
||||||
|
and the run_tests_batched.py hang returns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
# The conftest has already been loaded by pytest before this test
|
||||||
|
# collection. We just need to verify the watchdog thread is alive.
|
||||||
|
WATCHDOG_NAME = "conftest-hang-watchdog"
|
||||||
|
WATCHDOG_SLEEP_SECONDS = 30.0
|
||||||
|
WATCHDOG_TOLERANCE_SECONDS = 5.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_watchdog_thread_registered() -> None:
|
||||||
|
"""Verify the conftest's hang-bounding watchdog thread is alive.
|
||||||
|
|
||||||
|
The watchdog is a daemon thread named "conftest-hang-watchdog" that
|
||||||
|
sleeps for ~30s then calls os._exit(0). It must be alive (not yet
|
||||||
|
fired) at the time this test runs, because the pytest session has
|
||||||
|
not been running for 30s yet.
|
||||||
|
"""
|
||||||
|
threads = threading.enumerate()
|
||||||
|
names = [t.name for t in threads]
|
||||||
|
assert WATCHDOG_NAME in names, (
|
||||||
|
f"conftest watchdog thread {WATCHDOG_NAME!r} not found in "
|
||||||
|
f"threading.enumerate(); run_tests_batched.py will hang at end "
|
||||||
|
f"of batch. Active threads: {names}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_watchdog_thread_is_daemon() -> None:
|
||||||
|
"""Watchdog must be daemon so it doesn't block pytest's own exit."""
|
||||||
|
for t in threading.enumerate():
|
||||||
|
if t.name == WATCHDOG_NAME:
|
||||||
|
assert t.daemon, (
|
||||||
|
f"watchdog thread is not daemon (daemon={t.daemon}); "
|
||||||
|
f"this would prevent pytest from exiting cleanly"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
pytest.fail(f"watchdog thread {WATCHDOG_NAME!r} not found")
|
||||||
|
|
||||||
|
|
||||||
|
def test_watchdog_timeout_within_tolerance() -> None:
|
||||||
|
"""Watchdog timeout must be near the documented 30s value.
|
||||||
|
|
||||||
|
If the timeout drifts too low (<25s), normal slow batches could
|
||||||
|
be killed prematurely. If it drifts too high (>120s), the hang
|
||||||
|
bounding is too loose. This test enforces the contract.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
conftest_path = Path(__file__).resolve().parent / "conftest.py"
|
||||||
|
text = conftest_path.read_text(encoding="utf-8")
|
||||||
|
# Look for the watchdog sleep call and extract the timeout
|
||||||
|
match = re.search(r"time\.sleep\(([\d.]+)\)", text)
|
||||||
|
assert match is not None, (
|
||||||
|
f"could not find time.sleep() call in {conftest_path}; "
|
||||||
|
f"watchdog may have been removed or restructured"
|
||||||
|
)
|
||||||
|
sleep_value = float(match.group(1))
|
||||||
|
assert (
|
||||||
|
WATCHDOG_SLEEP_SECONDS - WATCHDOG_TOLERANCE_SECONDS
|
||||||
|
<= sleep_value
|
||||||
|
<= WATCHDOG_SLEEP_SECONDS + WATCHDOG_TOLERANCE_SECONDS
|
||||||
|
), (
|
||||||
|
f"watchdog timeout is {sleep_value}s; expected "
|
||||||
|
f"~{WATCHDOG_SLEEP_SECONDS}s +/- {WATCHDOG_TOLERANCE_SECONDS}s. "
|
||||||
|
f"If the timeout was intentionally changed, update this test."
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user