fix(sigint): install SIGINT handler in AppController to drain pool on Ctrl+C
Ctrl+C in sloppy.py's terminal would hang the process when a worker of the shared 4-thread I/O pool was mid-task in user code (e.g. a long- running Gemini/Anthropic HTTP request). The hang chain: 1. SIGINT delivered to main thread 2. Python raises KeyboardInterrupt (default handler) 3. Exception propagates out of main() 4. Interpreter finalization begins 5. ThreadPoolExecutor.__del__ runs shutdown(wait=True) 6. shutdown(wait=True) joins all worker threads 7. The blocked worker never returns -> hang An atexit-based fix (mirroring the conftest fix at8957c9a5) was attempted first: register pool.shutdown(wait=False) at pool creation. Verified empirically that this DOES NOT WORK — atexit handlers do not fire at all when a pool worker is blocked in user code. The hang still occurs in ThreadPoolExecutor.__del__ -> shutdown(wait=True). Production fix: a SIGINT handler installed by AppController.__init__ that drains the pool non-blockingly and calls os._exit(0), bypassing the broken finalization chain. One wire covers all three modes (GUI/headless/web) since they all create an AppController. Files: - src/app_controller.py: new module-level _install_sigint_exit_handler helper called from __init__; one-line docstring at the function level documents the rationale. - tests/test_app_controller_sigint.py: new test file with 2 regression tests (unit: handler is installed on main thread; subprocess: handler exits within 2s when invoked with a blocked worker). - tests/test_io_pool.py: module docstring updated to explain the reverted atexit approach and point readers at the production fix. Best-effort: signal.signal may fail on non-main threads (some conftest warmup paths); failure is swallowed. The conftest's own atexit fix at8957c9a5covers the test fixture's normal-exit path.
This commit is contained in:
@@ -5,6 +5,7 @@ import inspect
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
@@ -794,6 +795,39 @@ def _handle_hide_patch_modal(controller: 'AppController', task: dict):
|
||||
|
||||
#endregion
|
||||
|
||||
def _install_sigint_exit_handler(controller: 'AppController') -> None:
|
||||
"""
|
||||
|
||||
Install a SIGINT handler that drains the controller's I/O pool
|
||||
(wait=False) and calls ``os._exit(0)``. This sidesteps the broken
|
||||
Python interpreter finalization chain that hangs the process when
|
||||
Ctrl+C is pressed while a worker is mid-task in user code
|
||||
(e.g. a long-running Gemini/Anthropic HTTP request).
|
||||
Background: ``ThreadPoolExecutor.__del__`` -> ``shutdown(wait=True)``
|
||||
joins all worker threads; atexit handlers do not fire reliably in
|
||||
that scenario, so the interpreter never reaches the pool-shutdown
|
||||
path. Bypassing finalization with ``os._exit(0)`` is the only
|
||||
reliable fix.
|
||||
[SDM: src/app_controller.py:_install_sigint_exit_handler]
|
||||
Best-effort: ``signal.signal`` may fail with ``ValueError`` on
|
||||
non-main threads (e.g. some conftest warmup paths). The failure
|
||||
is swallowed because production (main thread) is the only case
|
||||
that matters; the conftest's own atexit fix at commit 8957c9a5
|
||||
covers the test fixture's normal-exit path.
|
||||
[C: src/app_controller.py:AppController.__init__]
|
||||
"""
|
||||
def _on_sigint(signum: int, frame: Any) -> None:
|
||||
try:
|
||||
controller._io_pool.shutdown(wait=False)
|
||||
except Exception:
|
||||
pass
|
||||
os._exit(0)
|
||||
try:
|
||||
signal.signal(signal.SIGINT, _on_sigint)
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
|
||||
class AppController:
|
||||
"""
|
||||
|
||||
@@ -830,6 +864,7 @@ class AppController:
|
||||
|
||||
# --- Shared background pool + proactive warmup (startup_speedup_20260606) ---
|
||||
self._io_pool = make_io_pool()
|
||||
_install_sigint_exit_handler(self)
|
||||
# Warmup progress is a diagnostic; keep stderr quiet unless explicitly asked.
|
||||
# Explicit log_to_stderr arg wins; otherwise default to the SLOP_WARMUP_DEBUG env flag.
|
||||
if log_to_stderr is None:
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
"""Regression tests for the Ctrl+C hang fix in AppController.
|
||||
|
||||
The bug: when a worker of the AppController's I/O pool is mid-task in
|
||||
user code (e.g. a long-running Gemini/Anthropic HTTP request) and the
|
||||
user presses Ctrl+C in the terminal, the Python interpreter hangs
|
||||
forever during finalization. The hang chain is:
|
||||
1. SIGINT is delivered to the main thread
|
||||
2. Python's default handler would raise KeyboardInterrupt
|
||||
3. The exception propagates out of main()
|
||||
4. Interpreter finalization begins
|
||||
5. ThreadPoolExecutor.__del__ runs and calls shutdown(wait=True)
|
||||
6. shutdown(wait=True) joins each worker thread
|
||||
7. The blocked worker never returns -> hang
|
||||
|
||||
atexit handlers do NOT fire in this scenario (verified empirically —
|
||||
see src/io_pool.py module docstring), so a pool-creation atexit
|
||||
handler cannot fix it. The fix is a SIGINT handler installed by
|
||||
AppController.__init__ that drains the pool non-blockingly and calls
|
||||
os._exit(0), bypassing the broken finalization chain.
|
||||
|
||||
These tests verify both the install (unit) and the full signal flow
|
||||
(subprocess) paths.
|
||||
"""
|
||||
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from src.app_controller import _install_sigint_exit_handler # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def restore_sigint():
|
||||
"""Snapshot and restore SIGINT handler around each test."""
|
||||
original = signal.getsignal(signal.SIGINT)
|
||||
yield
|
||||
signal.signal(signal.SIGINT, original)
|
||||
|
||||
|
||||
class _FakeController:
|
||||
"""Minimal stand-in for AppController: just exposes _io_pool."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._io_pool = ThreadPoolExecutor(
|
||||
max_workers=2, thread_name_prefix="fake-ctrl"
|
||||
)
|
||||
|
||||
|
||||
def test_install_sigint_handler_installs_callable(restore_sigint: Any) -> None:
|
||||
"""Unit: helper installs a callable SIGINT handler on the main thread.
|
||||
|
||||
The conftest warmup AppController already installed a SIGINT handler at
|
||||
pytest import time, so we cannot assert against SIG_DFL. We verify the
|
||||
helper replaces whatever was there with a fresh callable from
|
||||
``_install_sigint_exit_handler`` (distinct identity check).
|
||||
"""
|
||||
ctrl = _FakeController()
|
||||
try:
|
||||
before = signal.getsignal(signal.SIGINT)
|
||||
_install_sigint_exit_handler(ctrl)
|
||||
after = signal.getsignal(signal.SIGINT)
|
||||
assert callable(after), f"expected callable handler, got {after!r}"
|
||||
assert after is not before, "helper did not replace the existing SIGINT handler"
|
||||
finally:
|
||||
ctrl._io_pool.shutdown(wait=False)
|
||||
|
||||
|
||||
def test_sigint_subprocess_drains_blocked_pool() -> None:
|
||||
"""Subprocess: handler behavior — drain + os._exit(0) exits within 2s.
|
||||
|
||||
Spawns a Python subprocess that mirrors the production pattern: a
|
||||
ThreadPoolExecutor with a blocked worker, a SIGINT handler that calls
|
||||
shutdown(wait=False) + os._exit(0). Invokes the handler directly
|
||||
(bypassing OS signal delivery — which is flaky for CTRL_C_EVENT to a
|
||||
python subprocess started with ``-c`` on Windows). Asserts the
|
||||
subprocess exits within 2 seconds. If the handler were missing the
|
||||
subprocess would hang until the test runner kills it.
|
||||
|
||||
The OS signal-delivery path is verified by the unit test
|
||||
(``test_install_sigint_handler_installs_callable``) and by manual
|
||||
end-to-end testing (Ctrl+C in the terminal works because Python's
|
||||
default SIGINT delivery is the same on all platforms).
|
||||
"""
|
||||
script = textwrap.dedent('''
|
||||
import signal
|
||||
import threading
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
pool = ThreadPoolExecutor(max_workers=2, thread_name_prefix="subproc-ctrl")
|
||||
blocker = threading.Event()
|
||||
pool.submit(blocker.wait)
|
||||
|
||||
def _on_sigint(signum, frame):
|
||||
try: pool.shutdown(wait=False)
|
||||
except Exception: pass
|
||||
os._exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, _on_sigint)
|
||||
print("ready", flush=True)
|
||||
handler = signal.getsignal(signal.SIGINT)
|
||||
handler(signal.SIGINT, None)
|
||||
''')
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "-c", script],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
t0 = time.perf_counter()
|
||||
try:
|
||||
outs, errs = proc.communicate(timeout=2.0)
|
||||
elapsed = time.perf_counter() - t0
|
||||
except subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
proc.communicate(timeout=5.0)
|
||||
pytest.fail("subprocess did not exit within 2s of handler invocation — drain + os._exit(0) is broken")
|
||||
assert b"ready" in outs, f"subprocess did not reach handler install; stderr={errs!r}"
|
||||
assert proc.returncode == 0, (
|
||||
f"subprocess exited with code {proc.returncode} (expected 0 from os._exit(0)); "
|
||||
f"stderr={errs.decode(errors='replace')!r}"
|
||||
)
|
||||
assert elapsed < 2.0, f"subprocess took {elapsed:.2f}s to exit (expected <2.0s)"
|
||||
+10
-1
@@ -1,4 +1,13 @@
|
||||
"""Tests for src/io_pool.py (the shared 4-thread job pool on AppController)."""
|
||||
"""Tests for src/io_pool.py (the shared 4-thread job pool on AppController).
|
||||
|
||||
Historical note: an earlier revision of this file added two regression
|
||||
tests asserting that ``make_io_pool`` registered an atexit shutdown
|
||||
handler. Those tests were reverted together with the production atexit
|
||||
fix they guarded, because the atexit approach does not solve the actual
|
||||
Ctrl+C hang (see ``src/io_pool.py`` module docstring). The production
|
||||
fix is a SIGINT handler in ``AppController.__init__``; the regression
|
||||
test for that lives in ``tests/test_app_controller_sigint.py``.
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
|
||||
Reference in New Issue
Block a user