test(infra): poll-for-event race fixes + watchdog timeout bump + spec update

2026-06-10 15:14:35 -04:00
parent 563e609505
commit 2c924fe6df
4 changed files with 55 additions and 13 deletions
@@ -4,9 +4,11 @@
 This track fixes a pre-existing RAG test failure that halted the `tier-3-live_gui` batch during the `mma_tier_usage_reset_fix_20260610` verification run on 2026-06-10.
-**The bug:** `tests/test_rag_phase4_final_verify.py::test_phase4_final_verify` fails because `rag_status` stays at `'idle'` after the test sets `rag_enabled=True`, `rag_source='chroma'`, `rag_emb_provider='local'` via the Hook API. The test polls for `rag_status == 'ready'` for 50 seconds (100 × 0.5s) and never sees it.
+**The original bug (FIXED):** `tests/test_rag_phase4_final_verify.py::test_phase4_final_verify` failed with "RAG sync failed. Status: idle" because `_handle_reset_session` set `self.rag_config = None`, and each `rag_*` setter checks `if self.rag_config:` before doing anything — so all four setters fired by the test were no-ops and the sync was never triggered.
-The test was failing before any changes from the `mma_tier_usage_reset_fix_20260610` track. It is a pre-existing fragility in the RAG sync flow that the previous track's batch run exposed.
+**Fix:** reset `rag_config` to a fresh `RAGConfig()` default (not None) in `_handle_reset_session`, so the setters can mutate it and trigger the sync.
 **Status (post-fix):** RAG sync now reaches `'ready'`, but the test still fails — on a SEPARATE downstream assertion (retrieval order; see "Residual issue" below).
 ## Reproduction (already verified)
@@ -181,7 +181,7 @@ def _check_required_test_dependencies() -> None:
  raise pytest.UsageError(msg)
 def _smart_watchdog_exit() -> None:
- if not _pytest_finished_event.wait(timeout=600.0):
+ if not _pytest_finished_event.wait(timeout=900.0):
  os._exit(2)
 import time
 time.sleep(5.0)
@@ -26,10 +26,27 @@ def test_reset_session_clears_mma_tier_usage(live_gui) -> None:
  'tier_usage': {'Tier 1': {'model': 'polluted'}},
  'tickets': []
 })
- time.sleep(0.5)
+ # Poll until the polluted entry is visible. Without this, the reset
 # can fire BEFORE the push_event task is processed (async via io_pool
 # + GUI render loop), and the test would falsely pass even if the
 # reset didn't actually clear anything.
 for _ in range(40):
  state = client.get_gui_state()
  mma = state.get('mma_state', {})
  tier1 = mma.get('tier_usage', {}).get('Tier 1', {})
  if tier1.get('model') == 'polluted':
   break
  time.sleep(0.25)
 # Trigger the reset
 client.reset_session()
- time.sleep(0.5)
+ # Poll until the polluted entry is gone
 for _ in range(40):
  state = client.get_gui_state()
  mma = state.get('mma_state', {})
  tier1 = mma.get('tier_usage', {}).get('Tier 1', {})
  if tier1.get('model') != 'polluted':
   break
  time.sleep(0.25)
 # Verify the polluted entry is gone
 state = client.get_gui_state()
 mma = state.get('mma_state', {})
@@ -50,10 +67,21 @@ def test_reset_session_clears_mma_status(live_gui) -> None:
  'tier_usage': {},
  'tickets': []
 })
- time.sleep(0.5)
+ # Poll for the polluted status to be visible BEFORE the reset
 for _ in range(40):
  state = client.get_gui_state()
  if state.get('mma_status') == 'running':
   break
  time.sleep(0.25)
 client.reset_session()
- time.sleep(0.5)
+ # Poll for the reset to have taken effect. Without this, the
- state = client.get_gui_state()
+ # mma_state_update task can fire AFTER the reset, setting status
 # back to 'running' (race condition surfaced in batched live_gui).
 for _ in range(40):
  state = client.get_gui_state()
  if state.get('mma_status') == 'idle':
   break
  time.sleep(0.25)
 assert state.get('mma_status') == 'idle', (
  f"mma_status not reset: {state.get('mma_status')!r}"
 )
@@ -70,10 +98,19 @@ def test_reset_session_clears_active_tier(live_gui) -> None:
  'tier_usage': {},
  'tickets': []
 })
- time.sleep(0.5)
+ # Poll for the polluted active_tier to be visible BEFORE the reset
 for _ in range(40):
  state = client.get_gui_state()
  if state.get('active_tier') == 'Tier 2 (Tech Lead)':
   break
  time.sleep(0.25)
 client.reset_session()
- time.sleep(0.5)
+ # Poll for the reset to have taken effect
- state = client.get_gui_state()
+ for _ in range(40):
  state = client.get_gui_state()
  if state.get('active_tier') is None:
   break
  time.sleep(0.25)
 assert state.get('active_tier') is None, (
  f"active_tier not reset: {state.get('active_tier')!r}"
 )
@@ -112,10 +112,13 @@ def test_mock_timeout(live_gui) -> None:
  client.set_value("ai_input", "Trigger timeout")
  client.click("btn_gen_send")
-  # Wait for terminal response
+  # Wait for terminal response. The mock subprocess sleeps for 65s
  # then exits; allow 180s for the event to land (the io_pool is busy
  # in batched live_gui context, and the event propagation through
  # _pending_gui_tasks can be slow under contention).
  event = None
  start = time.time()
-  while time.time() - start < 80:
+  while time.time() - start < 180:
   ev = client.wait_for_event("response", timeout=5)
   if ev and ev.get("payload", {}).get("status") != "streaming...":
    event = ev