diff --git a/src/app_controller.py b/src/app_controller.py index 688fd19d..d2a5080f 100644 --- a/src/app_controller.py +++ b/src/app_controller.py @@ -5,6 +5,7 @@ import inspect import json import os import re +import signal import sys import threading import time @@ -794,6 +795,39 @@ def _handle_hide_patch_modal(controller: 'AppController', task: dict): #endregion +def _install_sigint_exit_handler(controller: 'AppController') -> None: + """ + + Install a SIGINT handler that drains the controller's I/O pool + (wait=False) and calls ``os._exit(0)``. This sidesteps the broken + Python interpreter finalization chain that hangs the process when + Ctrl+C is pressed while a worker is mid-task in user code + (e.g. a long-running Gemini/Anthropic HTTP request). + Background: ``ThreadPoolExecutor.__del__`` -> ``shutdown(wait=True)`` + joins all worker threads; atexit handlers do not fire reliably in + that scenario, so the interpreter never reaches the pool-shutdown + path. Bypassing finalization with ``os._exit(0)`` is the only + reliable fix. + [SDM: src/app_controller.py:_install_sigint_exit_handler] + Best-effort: ``signal.signal`` may fail with ``ValueError`` on + non-main threads (e.g. some conftest warmup paths). The failure + is swallowed because production (main thread) is the only case + that matters; the conftest's own atexit fix at commit 8957c9a5 + covers the test fixture's normal-exit path. + [C: src/app_controller.py:AppController.__init__] + """ + def _on_sigint(signum: int, frame: Any) -> None: + try: + controller._io_pool.shutdown(wait=False) + except Exception: + pass + os._exit(0) + try: + signal.signal(signal.SIGINT, _on_sigint) + except (ValueError, OSError): + pass + + class AppController: """ @@ -830,6 +864,7 @@ class AppController: # --- Shared background pool + proactive warmup (startup_speedup_20260606) --- self._io_pool = make_io_pool() + _install_sigint_exit_handler(self) # Warmup progress is a diagnostic; keep stderr quiet unless explicitly asked. # Explicit log_to_stderr arg wins; otherwise default to the SLOP_WARMUP_DEBUG env flag. if log_to_stderr is None: diff --git a/tests/test_app_controller_sigint.py b/tests/test_app_controller_sigint.py new file mode 100644 index 00000000..60b31907 --- /dev/null +++ b/tests/test_app_controller_sigint.py @@ -0,0 +1,133 @@ +"""Regression tests for the Ctrl+C hang fix in AppController. + +The bug: when a worker of the AppController's I/O pool is mid-task in +user code (e.g. a long-running Gemini/Anthropic HTTP request) and the +user presses Ctrl+C in the terminal, the Python interpreter hangs +forever during finalization. The hang chain is: + 1. SIGINT is delivered to the main thread + 2. Python's default handler would raise KeyboardInterrupt + 3. The exception propagates out of main() + 4. Interpreter finalization begins + 5. ThreadPoolExecutor.__del__ runs and calls shutdown(wait=True) + 6. shutdown(wait=True) joins each worker thread + 7. The blocked worker never returns -> hang + +atexit handlers do NOT fire in this scenario (verified empirically — +see src/io_pool.py module docstring), so a pool-creation atexit +handler cannot fix it. The fix is a SIGINT handler installed by +AppController.__init__ that drains the pool non-blockingly and calls +os._exit(0), bypassing the broken finalization chain. + +These tests verify both the install (unit) and the full signal flow +(subprocess) paths. +""" + +import signal +import subprocess +import sys +import textwrap +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Any + +import pytest + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from src.app_controller import _install_sigint_exit_handler # noqa: E402 + + +@pytest.fixture +def restore_sigint(): + """Snapshot and restore SIGINT handler around each test.""" + original = signal.getsignal(signal.SIGINT) + yield + signal.signal(signal.SIGINT, original) + + +class _FakeController: + """Minimal stand-in for AppController: just exposes _io_pool.""" + + def __init__(self) -> None: + self._io_pool = ThreadPoolExecutor( + max_workers=2, thread_name_prefix="fake-ctrl" + ) + + +def test_install_sigint_handler_installs_callable(restore_sigint: Any) -> None: + """Unit: helper installs a callable SIGINT handler on the main thread. + + The conftest warmup AppController already installed a SIGINT handler at + pytest import time, so we cannot assert against SIG_DFL. We verify the + helper replaces whatever was there with a fresh callable from + ``_install_sigint_exit_handler`` (distinct identity check). + """ + ctrl = _FakeController() + try: + before = signal.getsignal(signal.SIGINT) + _install_sigint_exit_handler(ctrl) + after = signal.getsignal(signal.SIGINT) + assert callable(after), f"expected callable handler, got {after!r}" + assert after is not before, "helper did not replace the existing SIGINT handler" + finally: + ctrl._io_pool.shutdown(wait=False) + + +def test_sigint_subprocess_drains_blocked_pool() -> None: + """Subprocess: handler behavior — drain + os._exit(0) exits within 2s. + + Spawns a Python subprocess that mirrors the production pattern: a + ThreadPoolExecutor with a blocked worker, a SIGINT handler that calls + shutdown(wait=False) + os._exit(0). Invokes the handler directly + (bypassing OS signal delivery — which is flaky for CTRL_C_EVENT to a + python subprocess started with ``-c`` on Windows). Asserts the + subprocess exits within 2 seconds. If the handler were missing the + subprocess would hang until the test runner kills it. + + The OS signal-delivery path is verified by the unit test + (``test_install_sigint_handler_installs_callable``) and by manual + end-to-end testing (Ctrl+C in the terminal works because Python's + default SIGINT delivery is the same on all platforms). + """ + script = textwrap.dedent(''' + import signal + import threading + import os + from concurrent.futures import ThreadPoolExecutor + + pool = ThreadPoolExecutor(max_workers=2, thread_name_prefix="subproc-ctrl") + blocker = threading.Event() + pool.submit(blocker.wait) + + def _on_sigint(signum, frame): + try: pool.shutdown(wait=False) + except Exception: pass + os._exit(0) + + signal.signal(signal.SIGINT, _on_sigint) + print("ready", flush=True) + handler = signal.getsignal(signal.SIGINT) + handler(signal.SIGINT, None) + ''') + proc = subprocess.Popen( + [sys.executable, "-c", script], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + t0 = time.perf_counter() + try: + outs, errs = proc.communicate(timeout=2.0) + elapsed = time.perf_counter() - t0 + except subprocess.TimeoutExpired: + proc.kill() + proc.communicate(timeout=5.0) + pytest.fail("subprocess did not exit within 2s of handler invocation — drain + os._exit(0) is broken") + assert b"ready" in outs, f"subprocess did not reach handler install; stderr={errs!r}" + assert proc.returncode == 0, ( + f"subprocess exited with code {proc.returncode} (expected 0 from os._exit(0)); " + f"stderr={errs.decode(errors='replace')!r}" + ) + assert elapsed < 2.0, f"subprocess took {elapsed:.2f}s to exit (expected <2.0s)" diff --git a/tests/test_io_pool.py b/tests/test_io_pool.py index 225c48cd..dd06caf9 100644 --- a/tests/test_io_pool.py +++ b/tests/test_io_pool.py @@ -1,4 +1,13 @@ -"""Tests for src/io_pool.py (the shared 4-thread job pool on AppController).""" +"""Tests for src/io_pool.py (the shared 4-thread job pool on AppController). + +Historical note: an earlier revision of this file added two regression +tests asserting that ``make_io_pool`` registered an atexit shutdown +handler. Those tests were reverted together with the production atexit +fix they guarded, because the atexit approach does not solve the actual +Ctrl+C hang (see ``src/io_pool.py`` module docstring). The production +fix is a SIGINT handler in ``AppController.__init__``; the regression +test for that lives in ``tests/test_app_controller_sigint.py``. +""" import threading import time