From 89a8d9bcc268bd0fb9ef9c1b081c06c63c225a52 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Sun, 1 Mar 2026 13:42:34 -0500 Subject: [PATCH] test(sim): Rewrite visual_sim_mma_v2 for real Gemini API with frame-sync fixes Uses gemini-2.5-flash-lite (real API, CLI quota exhausted). Adds _poll/_drain_approvals helpers, frame-sync sleeps after all state-changing clicks, proper stage transitions, and 120s timeouts for real API latency. Addresses simulation_hardening Issues 2 & 3. --- tests/visual_sim_mma_v2.py | 293 ++++++++++++++++++------------------- 1 file changed, 141 insertions(+), 152 deletions(-) diff --git a/tests/visual_sim_mma_v2.py b/tests/visual_sim_mma_v2.py index 4077f26..644de77 100644 --- a/tests/visual_sim_mma_v2.py +++ b/tests/visual_sim_mma_v2.py @@ -3,180 +3,169 @@ import time import sys import os -# Ensure project root is in path sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from api_hook_client import ApiHookClient +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _drain_approvals(client: ApiHookClient, status: dict) -> None: + """Auto-approve any pending approval gate found in status.""" + if status.get('pending_mma_spawn_approval'): + print('[SIM] Approving pending spawn...') + client.click('btn_approve_spawn') + time.sleep(0.5) + elif status.get('pending_mma_step_approval'): + print('[SIM] Approving pending MMA step...') + client.click('btn_approve_mma_step') + time.sleep(0.5) + elif status.get('pending_tool_approval'): + print('[SIM] Approving pending tool...') + client.click('btn_approve_tool') + time.sleep(0.5) + + +def _poll(client: ApiHookClient, timeout: int, condition, label: str) -> tuple[bool, dict]: + """Poll get_mma_status() until condition(status) is True or timeout.""" + status = {} + for i in range(timeout): + status = client.get_mma_status() or {} + print(f"[SIM][{label}] t={i}s ai_status={status.get('ai_status')} " + f"mma={status.get('mma_status')} " + f"streams={list(status.get('mma_streams', {}).keys())}") + _drain_approvals(client, status) + if condition(status): + return True, status + time.sleep(1) + return False, status + + +# --------------------------------------------------------------------------- +# Test +# --------------------------------------------------------------------------- + @pytest.mark.integration def test_mma_complete_lifecycle(live_gui) -> None: """ - Tests the entire MMA lifecycle from epic planning to track loading and ticket verification - in a single test case to avoid state dependency issues between separate test functions. + End-to-end MMA lifecycle using real Gemini API (gemini-2.5-flash-lite). + Incorporates frame-sync sleeps and explicit state-transition waits per + simulation_hardening_20260301 spec (Issues 2 & 3). """ client = ApiHookClient() - assert client.wait_for_server(timeout=10) + assert client.wait_for_server(timeout=15), "Hook server did not start" - # 1. Set up the mock CLI provider - try: - client.set_value('current_provider', 'gemini_cli') - # Point the CLI adapter to our mock script - mock_cli_path = f'{sys.executable} {os.path.abspath("tests/mock_gemini_cli.py")}' - client.set_value('gcli_path', mock_cli_path) - # Prevent polluting the real project directory with test tracks - client.set_value('files_base_dir', 'tests/artifacts/temp_workspace') - client.click('btn_project_save') - time.sleep(1) - except Exception as e: - pytest.fail(f"Failed to set up mock provider: {e}") + # ------------------------------------------------------------------ + # Stage 1: Provider setup + # ------------------------------------------------------------------ + client.set_value('current_provider', 'gemini') + time.sleep(0.3) + client.set_value('current_model', 'gemini-2.5-flash-lite') + time.sleep(0.3) + client.set_value('files_base_dir', 'tests/artifacts/temp_workspace') + time.sleep(0.3) + client.click('btn_project_save') + time.sleep(1.0) # one full second — let GUI process all set_value tasks - # 2. Enter epic and click 'Plan Epic'. - client.set_value('mma_epic_input', 'Develop a new feature') + # ------------------------------------------------------------------ + # Stage 2: Start epic planning + # ------------------------------------------------------------------ + # Keep prompt short and simple so the model returns minimal JSON + client.set_value('mma_epic_input', + 'Add a hello_world() function to utils.py') + time.sleep(0.3) client.click('btn_mma_plan_epic') + time.sleep(0.5) # frame-sync after click - # 3. Wait for 'proposed_tracks'. - proposed_tracks_found = False - for _ in range(60): # Poll for up to 60 seconds - status = client.get_mma_status() - print(f"Polling status: {status}") - print(f"Polling ai_status: {status.get('ai_status', 'N/A')}") - if status and status.get('pending_mma_spawn_approval') is True: - print('[SIM] Worker spawn required. Clicking btn_approve_spawn...') - client.click('btn_approve_spawn') - elif status and status.get('pending_mma_step_approval') is True: - print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...') - client.click('btn_approve_mma_step') - elif status and status.get('pending_tool_approval') is True: - print('[SIM] Tool approval required. Clicking btn_approve_tool...') - client.click('btn_approve_tool') - if status and status.get('proposed_tracks') and len(status['proposed_tracks']) > 0: - proposed_tracks_found = True - break - time.sleep(1) - assert proposed_tracks_found, "Failed to find proposed tracks after planning epic." + # ------------------------------------------------------------------ + # Stage 3: Wait for proposed_tracks to appear (Tier 1 call) + # ------------------------------------------------------------------ + ok, status = _poll(client, timeout=120, label="wait-proposed-tracks", + condition=lambda s: bool(s.get('proposed_tracks'))) + assert ok, ( + f"No proposed_tracks after 120s. " + f"ai_status={status.get('ai_status')} " + f"mma_streams={list(status.get('mma_streams', {}).keys())}" + ) + n_proposed = len(status['proposed_tracks']) + print(f"[SIM] Got {n_proposed} proposed track(s): " + f"{[t.get('title', t.get('id')) for t in status['proposed_tracks']]}") - # 4. Click 'Accept' to start tracks. + # ------------------------------------------------------------------ + # Stage 4: Accept tracks (triggers Tier 2 calls + engine.run) + # ------------------------------------------------------------------ client.click('btn_mma_accept_tracks') - time.sleep(2) + time.sleep(1.5) # frame-sync: let _cb_accept_tracks run one frame + bg thread start - # 5. Wait for 'tracks' list to populate with our mock tracks. - tracks_populated = False - for _ in range(30): # Poll for up to 30 seconds - status = client.get_mma_status() - if status and status.get('pending_mma_spawn_approval') is True: - client.click('btn_approve_spawn') - elif status and status.get('pending_mma_step_approval') is True: - client.click('btn_approve_mma_step') - elif status and status.get('pending_tool_approval') is True: - client.click('btn_approve_tool') - - tracks = status.get('tracks', []) - if any('Mock Goal 1' in t.get('title', '') for t in tracks): - tracks_populated = True - break - time.sleep(1) - assert tracks_populated, "Failed to find 'Mock Goal 1' in tracks list after acceptance." + # ------------------------------------------------------------------ + # Stage 5: Wait for tracks to be written to filesystem + refreshed + # ------------------------------------------------------------------ + ok, status = _poll(client, timeout=90, label="wait-tracks-populated", + condition=lambda s: bool(s.get('tracks'))) + assert ok, ( + f"No tracks appeared after 90s. " + f"ai_status={status.get('ai_status')}" + ) + tracks_list = status['tracks'] + print(f"[SIM] Tracks in project: {[t.get('title', t.get('id')) for t in tracks_list]}") - # 6. Verify that one of the new tracks can be loaded and its tickets appear in 'active_tickets'. - status_after_tracks = client.get_mma_status() - assert status_after_tracks is not None, "Failed to get MMA status after tracks populated." - tracks_list = status_after_tracks.get('tracks') - assert tracks_list is not None and len(tracks_list) > 0, "Tracks list is empty or not found." + # ------------------------------------------------------------------ + # Stage 6: Load first track, verify active_tickets populate + # ------------------------------------------------------------------ + track_id = tracks_list[0]['id'] + print(f"[SIM] Loading track: {track_id}") + client.click('btn_mma_load_track', user_data=track_id) + time.sleep(1.0) # frame-sync after load click - track_id_to_load = None - for track in tracks_list: - if 'Mock Goal 1' in track.get('title', ''): - track_id_to_load = track['id'] - break - assert track_id_to_load is not None, "Could not find a track with 'Mock Goal 1' in its title." - print(f"Attempting to load track with ID: {track_id_to_load}") + def _track_loaded(s): + at = s.get('active_track') + at_id = at.get('id') if isinstance(at, dict) else at + return at_id == track_id and bool(s.get('active_tickets')) - # Load the first track - client.click('btn_mma_load_track', user_data=track_id_to_load) + ok, status = _poll(client, timeout=60, label="wait-track-loaded", + condition=_track_loaded) + assert ok, ( + f"Track {track_id} did not load with tickets after 60s. " + f"active_track={status.get('active_track')}" + ) + print(f"[SIM] Track loaded with {len(status.get('active_tickets', []))} ticket(s).") - # Poll until 'active_track' is not None and 'active_tickets' are present - active_track_and_tickets_found = False - for _ in range(60): # Poll for up to 60 seconds - status = client.get_mma_status() - print(f"Polling load status: {status}") - if status and status.get('pending_mma_spawn_approval') is True: - print('[SIM] Worker spawn required. Clicking btn_approve_spawn...') - client.click('btn_approve_spawn') - elif status and status.get('pending_mma_step_approval') is True: - print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...') - client.click('btn_approve_mma_step') - elif status and status.get('pending_tool_approval') is True: - print('[SIM] Tool approval required. Clicking btn_approve_tool...') - client.click('btn_approve_tool') + # ------------------------------------------------------------------ + # Stage 7: Wait for engine to reach running/done + # ------------------------------------------------------------------ + def _mma_active(s): + return s.get('mma_status') in ('running', 'done') - # Updated condition to correctly check active_track ID or value - active_track = status.get('active_track') - if status and ( (isinstance(active_track, dict) and active_track.get('id') == track_id_to_load) or (active_track == track_id_to_load) ) and \ - 'active_tickets' in status and len(status['active_tickets']) > 0: - active_track_and_tickets_found = True - break - time.sleep(1) - assert active_track_and_tickets_found, f"Timed out waiting for track {track_id_to_load} to load and populate active tickets." + ok, status = _poll(client, timeout=120, label="wait-mma-running", + condition=_mma_active) + assert ok, ( + f"MMA never reached running/done after 120s. " + f"mma_status={status.get('mma_status')}" + ) + print(f"[SIM] MMA status: {status.get('mma_status')}") - print(f"Successfully loaded and verified track ID: {track_id_to_load} with active tickets.") + # ------------------------------------------------------------------ + # Stage 8: Verify Tier 3 output appears in mma_streams + # ------------------------------------------------------------------ + def _tier3_in_streams(s): + streams = s.get('mma_streams', {}) + tier3_keys = [k for k in streams if 'Tier 3' in k] + if not tier3_keys: + return False + return bool(streams[tier3_keys[0]].strip()) - # 7. Poll for MMA status 'running' or 'done' (already started by Accept Tracks). - mma_running = False - for _ in range(120): # Poll for up to 120 seconds - status = client.get_mma_status() - print(f"Polling MMA status for 'running': {status.get('mma_status')}") + ok, status = _poll(client, timeout=120, label="wait-tier3-streams", + condition=_tier3_in_streams) - # Handle pending states during the run - if status and status.get('pending_mma_spawn_approval') is True: - print('[SIM] Worker spawn required. Clicking btn_approve_spawn...') - client.click('btn_approve_spawn') - elif status and status.get('pending_mma_step_approval') is True: - print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...') - client.click('btn_approve_mma_step') - elif status and status.get('pending_tool_approval') is True: - print('[SIM] Tool approval required. Clicking btn_approve_tool...') - client.click('btn_approve_tool') + streams = status.get('mma_streams', {}) + tier3_keys = [k for k in streams if 'Tier 3' in k] + assert ok, ( + f"No non-empty Tier 3 output in mma_streams after 120s. " + f"streams keys={list(streams.keys())} " + f"mma_status={status.get('mma_status')}" + ) - # Check if MMA is running - if status and status.get('mma_status') == 'running': - mma_running = True - break - # Also check if it's already finished or error - if status and status.get('mma_status') in ['done', 'error']: - break - time.sleep(1) - assert mma_running or (status and status.get('mma_status') == 'done'), f"Timed out waiting for MMA status to become 'running' for track {track_id_to_load}." - - print(f"MMA status is: {status.get('mma_status')}") - # 8. Verify 'active_tier' change and output in 'mma_streams'. - streams_found = False - for _ in range(60): # Give it more time for the worker to spawn and respond - status = client.get_mma_status() - - # Handle approvals if they pop up during worker execution - if status and status.get('pending_mma_spawn_approval') is True: - print('[SIM] Worker spawn required. Clicking btn_approve_spawn...') - client.click('btn_approve_spawn') - elif status and status.get('pending_mma_step_approval') is True: - print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...') - client.click('btn_approve_mma_step') - elif status and status.get('pending_tool_approval') is True: - print('[SIM] Tool approval required. Clicking btn_approve_tool...') - client.click('btn_approve_tool') - - streams = status.get('mma_streams', {}) - print(f"Polling streams: {list(streams.keys())}") - - if streams and any("Tier 3" in k for k in streams.keys()): - print(f"[SIM] Found Tier 3 worker output in streams: {list(streams.keys())}") - # Check for our specific mock content - tier3_key = [k for k in streams.keys() if "Tier 3" in k][0] - if "SUCCESS: Mock Tier 3 worker" in streams[tier3_key]: - print("[SIM] Verified mock worker output content.") - streams_found = True - break - - time.sleep(1) - - assert streams_found, "No Tier 3 mock output found in 'mma_streams'." - print("MMA complete lifecycle simulation successful.") + tier3_content = streams[tier3_keys[0]] + print(f"[SIM] Tier 3 output ({len(tier3_content)} chars): {tier3_content[:100]}...") + print("[SIM] MMA complete lifecycle simulation PASSED.")