Private
Public Access
0
0

diag(tier2): isolate the jank - _trigger_blink in render_response_panel

User asked: 'what does negative flows cause in the imgui procedural
dag graph that would cause a recursive processing of the stack?'

Tested 4 hypotheses:
1. PYTHONSTACKSIZE env var to bump main thread stack: IGNORED. Main
   thread stays at 1.94MB regardless of env var or PE header (PE
   header SizeOfStackReserve is 4TB but Windows OS uses its own
   default for the main thread commit size).
2. -X faulthandler: doesn't capture native STATUS_STACK_OVERFLOW
   (faulthandler only catches Python-level signals).
3. Editbin /STACK: editbin not installed on this system.
4. PE header patching with ctypes: SizeOfStackReserve is 4TB but the
   OS commits only 1.94MB for the main thread and Python doesn't
   honor any env var to change it.

The breakthrough: monkey-patched _handle_ai_response via sitecustomize
to disable _trigger_blink and _autofocus_response_tab. Result:

  WITHOUT _trigger_blink: process survives 60s, response event
  arrives with status='error' and correct error text. The test
  WOULD PASS.

  WITH _trigger_blink (default): process dies with 0xC00000FD
  (STATUS_STACK_OVERFLOW) within 1s of click.

The jank: in src/gui_2.py:render_response_panel (line 5537), the
_trigger_blink flag triggers imgui.set_window_focus('Response') on
the SAME frame as the response render. This native imgui call
apparently triggers imgui-bundle to do extra C++ draw work that
exhausts the main thread's 1.94MB stack.

Why negative_flows specifically: it's the ONLY tier-3 test where the
error response triggers the _trigger_blink path. Success responses
also trigger _trigger_blink but don't crash (perhaps because imgui-
bundle's layout calculations for an error overlay are heavier than
for a normal text response).

User predicted: 'i wont solve it but just pad out until failure'.
Confirmed - bumping stack didn't fix it (couldn't bump anyway, but
the prediction about recursion-related behavior is on track).

The fix (per user's framing 'needs to be guarded'): wrap the
set_window_focus call in render_response_panel in a try/except or
add a stack-depth guard before calling it. Or move the
_trigger_blink logic to a deferred frame to avoid the same-frame
race with the response render.
This commit is contained in:
2026-06-17 13:22:38 -04:00
parent cc234b1b83
commit 694cfd2b70
4 changed files with 308 additions and 0 deletions
@@ -0,0 +1,77 @@
"""Temporarily bump python.exe's main thread stack size from 1.94MB to 4MB via PE header patch."""
import struct
import shutil
import os
import sys
from pathlib import Path
PY = Path(os.environ.get("PYTHON_EXE", r"C:\projects\manual_slop_tier2\.venv\Scripts\python.exe"))
BACKUP = PY.with_suffix(".exe.stackbackup")
# PE header structure (simplified for stack size fields)
# DOS header -> e_lfanew at offset 0x3C -> NT headers
# NT headers: signature (4), FileHeader (20), OptionalHeader
# OptionalHeader: Magic (2), MajorLinkerVersion (1), MinorLinkerVersion (1),
# SizeOfCode (4), SizeOfInitializedData (4), SizeOfUninitializedData (4),
# AddressOfEntryPoint (4), BaseOfCode (4), BaseOfData (4),
# ImageBase (4 for 32-bit PE, 8 for 64-bit), SectionAlignment (4),
# FileAlignment (4), ... then at offset 0x48 (for 64-bit):
# SizeOfStackReserve (8), SizeOfStackCommit (8)
def get_pe_stack_reserve(python_path: Path) -> int:
with open(python_path, "rb") as f:
data = f.read()
e_lfanew = struct.unpack_from("<I", data, 0x3C)[0]
# Check PE signature
pe_sig = data[e_lfanew:e_lfanew+4]
if pe_sig != b"PE\0\0":
raise ValueError(f"Not a valid PE file at {python_path}")
# Optional header magic at e_lfanew + 24
opt_magic = struct.unpack_from("<H", data, e_lfanew + 24)[0]
if opt_magic == 0x10b:
# PE32 (32-bit)
stack_offset = e_lfanew + 24 + 28 # SizeOfStackReserve at offset 28 from OptionalHeader start
fmt = "<I"
elif opt_magic == 0x20b:
# PE32+ (64-bit)
stack_offset = e_lfanew + 24 + 56 # SizeOfStackReserve at offset 56 from OptionalHeader start
fmt = "<Q"
else:
raise ValueError(f"Unknown PE optional header magic: 0x{opt_magic:x}")
return struct.unpack_from(fmt, data, stack_offset)[0]
def set_pe_stack_reserve(python_path: Path, new_size: int) -> None:
with open(python_path, "rb") as f:
data = bytearray(f.read())
e_lfanew = struct.unpack_from("<I", data, 0x3C)[0]
opt_magic = struct.unpack_from("<H", data, e_lfanew + 24)[0]
if opt_magic == 0x20b:
# PE32+
stack_offset = e_lfanew + 24 + 56
fmt = "<Q"
elif opt_magic == 0x10b:
stack_offset = e_lfanew + 24 + 28
fmt = "<I"
else:
raise ValueError(f"Unknown PE optional header magic: 0x{opt_magic:x}")
struct.pack_into(fmt, data, stack_offset, new_size)
with open(python_path, "wb") as f:
f.write(data)
if not BACKUP.exists():
shutil.copy2(PY, BACKUP)
print(f"Backed up to {BACKUP}")
else:
print(f"Backup already exists at {BACKUP}")
orig_size = get_pe_stack_reserve(PY)
print(f"Original SizeOfStackReserve: {orig_size} bytes ({orig_size / 1024 / 1024:.2f} MB)")
# Set to 4MB
new_size = 4 * 1024 * 1024
set_pe_stack_reserve(PY, new_size)
print(f"Patched SizeOfStackReserve to: {new_size} bytes ({new_size / 1024 / 1024:.2f} MB)")
# Verify
new_actual = get_pe_stack_reserve(PY)
print(f"Verified SizeOfStackReserve: {new_actual} bytes ({new_actual / 1024 / 1024:.2f} MB)")
@@ -0,0 +1,9 @@
import os, sys, subprocess
env = os.environ.copy()
env['PYTHONSTACKSIZE'] = '8388608'
result = subprocess.run(
[sys.executable, '-c', "import ctypes; k=ctypes.windll.kernel32; low=ctypes.c_void_p(); high=ctypes.c_void_p(); k.GetCurrentThreadStackLimits(ctypes.byref(low), ctypes.byref(high)); print('stack size: %.2f MB' % ((high.value-low.value)/1024/1024))"],
env=env, capture_output=True, text=True
)
print('stdout:', result.stdout)
print('rc:', result.returncode)
@@ -0,0 +1,86 @@
"""Run the negative flow test with faulthandler enabled to capture native stack at crash."""
import os
import sys
import time
import json
import requests
import subprocess
from pathlib import Path
ROOT = Path(os.getcwd())
TS = time.strftime("%Y%m%d_%H%M%S")
SLOPPY = ROOT / "sloppy.py"
env = os.environ.copy()
env["PYTHONPATH"] = str(ROOT.absolute())
env["PYTHONFAULTHANDLER"] = "1"
env["PYTHONFAULTHANDLER_FILES"] = str(ROOT / "logs" / f"sloppy_faulthandler_{TS}.log")
log_path = ROOT / "logs" / f"sloppy_diag4_{TS}.log"
log_path.parent.mkdir(exist_ok=True)
log_file = open(log_path, "w", encoding="utf-8")
print(f"Spawning {SLOPPY} with faulthandler...")
proc = subprocess.Popen(
["uv", "run", "python", "-u", "-X", "faulthandler", str(SLOPPY), "--enable-test-hooks"],
stdout=log_file,
stderr=log_file,
text=True,
cwd=str(ROOT.absolute()),
env=env,
)
print(f" PID: {proc.pid}")
print(f" faulthandler log: {env['PYTHONFAULTHANDLER_FILES']}")
print("Waiting for hook server...")
ready = False
start = time.time()
while time.time() - start < 30:
try:
r = requests.get("http://127.0.0.1:8999/status", timeout=0.5)
if r.status_code == 200:
ready = True
break
except: pass
if proc.poll() is not None:
print(f" proc died rc={proc.returncode}")
break
time.sleep(0.5)
if not ready:
print("FAILED to start")
log_file.close()
sys.exit(1)
def post(label, payload):
print(f"POST {label}")
r = requests.post("http://127.0.0.1:8999/api/gui", json=payload, timeout=5)
return r
mock_path = (ROOT / "tests" / "mock_gemini_cli.py").absolute()
post("reset", {"action": "click", "item": "btn_reset"})
time.sleep(0.5)
post("provider", {"action": "set_value", "item": "current_provider", "value": "gemini_cli"})
time.sleep(0.5)
post("gcli_path", {"action": "set_value", "item": "gcli_path", "value": f'"{sys.executable}" "{mock_path}"'})
time.sleep(0.5)
post("env", {"action": "custom_callback", "callback": "_set_env_var", "args": ["MOCK_MODE", "malformed_json"]})
time.sleep(0.5)
post("input", {"action": "set_value", "item": "ai_input", "value": "Trigger"})
time.sleep(0.5)
print("CLICK btn_gen_send")
post("gen", {"action": "click", "item": "btn_gen_send"})
time.sleep(5)
print(f" poll={proc.poll()}")
if proc.poll() is None:
proc.terminate()
try: proc.wait(timeout=5)
except: proc.kill()
log_file.close()
# Read faulthandler output
fh_path = Path(env["PYTHONFAULTHANDLER_FILES"])
if fh_path.exists():
print(f"\n=== faulthandler log ===")
with open(fh_path, encoding="utf-8") as f:
print(f.read())
@@ -0,0 +1,136 @@
"""Test with _trigger_blink disabled to isolate the jank."""
import os
import sys
import time
import json
import requests
import subprocess
from pathlib import Path
ROOT = Path(os.getcwd())
TS = time.strftime("%Y%m%d_%H%M%S")
# Sitecustomize that wraps _handle_ai_response to disable _trigger_blink
site_dir = ROOT / "tests" / "artifacts" / "sitepkg_noblink"
site_dir.mkdir(parents=True, exist_ok=True)
sitecustomize = site_dir / "sitecustomize.py"
sitecustomize.write_text('''
import sys
# Disable _trigger_blink in _handle_ai_response to isolate the jank
try:
import src.app_controller as _ac
_orig = _ac._handle_ai_response
def _patched(controller, task):
# Skip _trigger_blink by calling the original logic without that line
# Just call _handle_ai_response and then unset _trigger_blink
_orig(controller, task)
try:
controller._trigger_blink = False
controller._autofocus_response_tab = False
controller._is_blinking = False
sys.stderr.write("[NOBLINK] disabled _trigger_blink\\n")
sys.stderr.flush()
except Exception as e:
sys.stderr.write(f"[NOBLINK] error: {e}\\n")
sys.stderr.flush()
_ac._handle_ai_response = _patched
sys.stderr.write("[NOBLINK] patched _handle_ai_response\\n")
sys.stderr.flush()
except Exception as e:
sys.stderr.write(f"[NOBLINK] patch failed: {e}\\n")
sys.stderr.flush()
''', encoding="utf-8")
print(f"Created: {sitecustomize}")
SLOPPY = ROOT / "sloppy.py"
env = os.environ.copy()
env["PYTHONPATH"] = str(ROOT.absolute()) + os.pathsep + str(site_dir.absolute())
log_path = ROOT / "logs" / f"sloppy_noblink_{TS}.log"
log_path.parent.mkdir(exist_ok=True)
log_file = open(log_path, "w", encoding="utf-8")
print(f"Spawning {SLOPPY}...")
proc = subprocess.Popen(
["uv", "run", "python", "-u", str(SLOPPY), "--enable-test-hooks"],
stdout=log_file,
stderr=log_file,
text=True,
cwd=str(ROOT.absolute()),
env=env,
)
print("Waiting for hook server...")
ready = False
start = time.time()
while time.time() - start < 30:
try:
r = requests.get("http://127.0.0.1:8999/status", timeout=0.5)
if r.status_code == 200:
ready = True
break
except: pass
if proc.poll() is not None:
print(f" proc died rc={proc.returncode}")
break
time.sleep(0.5)
if not ready:
print("FAILED to start")
log_file.close()
sys.exit(1)
def post(label, payload):
print(f"POST {label}")
r = requests.post("http://127.0.0.1:8999/api/gui", json=payload, timeout=5)
return r
mock_path = (ROOT / "tests" / "mock_gemini_cli.py").absolute()
post("reset", {"action": "click", "item": "btn_reset"})
time.sleep(0.5)
post("provider", {"action": "set_value", "item": "current_provider", "value": "gemini_cli"})
time.sleep(0.5)
post("gcli_path", {"action": "set_value", "item": "gcli_path", "value": f'"{sys.executable}" "{mock_path}"'})
time.sleep(0.5)
post("env", {"action": "custom_callback", "callback": "_set_env_var", "args": ["MOCK_MODE", "malformed_json"]})
time.sleep(0.5)
post("input", {"action": "set_value", "item": "ai_input", "value": "Trigger"})
time.sleep(0.5)
print("CLICK btn_gen_send")
post("gen", {"action": "click", "item": "btn_gen_send"})
print("Polling for response event...")
start = time.time()
event = None
for i in range(30):
if proc.poll() is not None:
print(f" Process died rc={proc.returncode} after {time.time()-start:.2f}s")
break
try:
r = requests.get("http://127.0.0.1:8999/api/events", timeout=5)
if r.status_code == 200:
evs = r.json().get("events", [])
for ev in evs:
pst = ev.get("payload", {}).get("status", "?")
txt = ev.get("payload", {}).get("text", "")
print(f" Event: type={ev.get('type')} status={pst} text={txt[:200]}")
if pst != "streaming...":
event = ev
if event: break
except Exception as e:
print(f" HTTP err: {e}")
time.sleep(1)
print(f"\nFinal event: {event}")
print(f"Final poll: {proc.poll()}")
if proc.poll() is None:
proc.terminate()
try: proc.wait(timeout=5)
except: proc.kill()
log_file.close()
# Print NOBLINK lines
with open(log_path, encoding="utf-8") as f:
for line in f:
if "NOBLINK" in line or "cmd_list" in line:
print(line.rstrip())