test(sim): Rewrite visual_sim_mma_v2 for real Gemini API with frame-sync fixes

Switches the test from the mock gemini_cli provider to the real Gemini API using gemini-2.5-flash-lite (the CLI quota is exhausted). Adds the _poll and _drain_approvals helpers, frame-sync sleeps after every state-changing click, explicit stage transitions, and timeouts of up to 120s to absorb real API latency. Addresses Issues 2 & 3 of the simulation_hardening spec.
2026-03-01 13:42:34 -05:00
parent 24ed309ac1
commit 89a8d9bcc2
1 changed files with 141 additions and 152 deletions
--- a/tests/visual_sim_mma_v2.py
+++ b/tests/visual_sim_mma_v2.py
@@ -3,180 +3,169 @@ import time
 import sys
 import os
 # Ensure project root is in path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from api_hook_client import ApiHookClient
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _drain_approvals(client: ApiHookClient, status: dict) -> None:
    """Approve the first pending approval gate reported in ``status``.

    Gates are examined in a fixed priority order: spawn, MMA step, tool.
    At most one gate is approved per call; after clicking its button the
    function pauses for 0.5s. If no gate is pending, nothing happens.
    """
    # (status flag, log line, button to click), in priority order.
    gates = (
        ('pending_mma_spawn_approval',
         '[SIM] Approving pending spawn...',
         'btn_approve_spawn'),
        ('pending_mma_step_approval',
         '[SIM] Approving pending MMA step...',
         'btn_approve_mma_step'),
        ('pending_tool_approval',
         '[SIM] Approving pending tool...',
         'btn_approve_tool'),
    )
    for flag, message, button in gates:
        if not status.get(flag):
            continue
        print(message)
        client.click(button)
        time.sleep(0.5)
        # Only the highest-priority pending gate is handled per call.
        return
 def _poll(client: ApiHookClient, timeout: int, condition, label: str) -> tuple[bool, dict]:
    """Poll get_mma_status() until condition(status) is True or timeout.

    Args:
        client: Hook client used to read MMA status and click approvals.
        timeout: Wall-clock budget in seconds, measured with a monotonic
            clock. The status is always fetched at least once, even when
            the budget is zero.
        condition: Callable taking the status dict; polling stops as soon
            as it returns a truthy value.
        label: Tag included in each progress line printed while polling.

    Returns:
        ``(True, status)`` when the condition was met, otherwise
        ``(False, status)`` with the last status seen (``{}`` if the client
        never returned one).
    """
    status = {}
    started = time.monotonic()
    # A deadline keeps `timeout` in real seconds: status-call latency and
    # approval pauses count against the budget instead of extending it.
    deadline = started + timeout
    while True:
        status = client.get_mma_status() or {}
        elapsed = int(time.monotonic() - started)
        # `mma_streams` may be present but None; treat that as "no streams".
        streams = status.get('mma_streams') or {}
        print(f"[SIM][{label}] t={elapsed}s ai_status={status.get('ai_status')} "
              f"mma={status.get('mma_status')} "
              f"streams={list(streams.keys())}")
        _drain_approvals(client, status)
        if condition(status):
            return True, status
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False, status
        # Never sleep past the deadline, so a failed poll returns promptly.
        time.sleep(min(1.0, remaining))
 # ---------------------------------------------------------------------------
 # Test
 # ---------------------------------------------------------------------------
@pytest.mark.integration
 def test_mma_complete_lifecycle(live_gui) -> None:
    """
-    Tests the entire MMA lifecycle from epic planning to track loading and ticket verification
+    End-to-end MMA lifecycle using real Gemini API (gemini-2.5-flash-lite).
-    in a single test case to avoid state dependency issues between separate test functions.
+    Incorporates frame-sync sleeps and explicit state-transition waits per
    simulation_hardening_20260301 spec (Issues 2 & 3).
    """
    client = ApiHookClient()
-    assert client.wait_for_server(timeout=10)
+    assert client.wait_for_server(timeout=15), "Hook server did not start"
-    # 1. Set up the mock CLI provider
+    # ------------------------------------------------------------------
-    try:
+    # Stage 1: Provider setup
-        client.set_value('current_provider', 'gemini_cli')
+    # ------------------------------------------------------------------
-        # Point the CLI adapter to our mock script
+    client.set_value('current_provider', 'gemini')
-        mock_cli_path = f'{sys.executable} {os.path.abspath("tests/mock_gemini_cli.py")}'
+    time.sleep(0.3)
-        client.set_value('gcli_path', mock_cli_path)
+    client.set_value('current_model', 'gemini-2.5-flash-lite')
-        # Prevent polluting the real project directory with test tracks
+    time.sleep(0.3)
-        client.set_value('files_base_dir', 'tests/artifacts/temp_workspace')
+    client.set_value('files_base_dir', 'tests/artifacts/temp_workspace')
-        client.click('btn_project_save')
+    time.sleep(0.3)
-        time.sleep(1)
+    client.click('btn_project_save')
-    except Exception as e:
+    time.sleep(1.0)   # one full second — let GUI process all set_value tasks
        pytest.fail(f"Failed to set up mock provider: {e}")
-    # 2. Enter epic and click 'Plan Epic'.
+    # ------------------------------------------------------------------
-    client.set_value('mma_epic_input', 'Develop a new feature')
+    # Stage 2: Start epic planning
    # ------------------------------------------------------------------
    # Keep prompt short and simple so the model returns minimal JSON
    client.set_value('mma_epic_input',
                     'Add a hello_world() function to utils.py')
    time.sleep(0.3)
    client.click('btn_mma_plan_epic')
    time.sleep(0.5)   # frame-sync after click
-    # 3. Wait for 'proposed_tracks'.
+    # ------------------------------------------------------------------
-    proposed_tracks_found = False
+    # Stage 3: Wait for proposed_tracks to appear (Tier 1 call)
-    for _ in range(60): # Poll for up to 60 seconds
+    # ------------------------------------------------------------------
-        status = client.get_mma_status()
+    ok, status = _poll(client, timeout=120, label="wait-proposed-tracks",
-        print(f"Polling status: {status}")
+                       condition=lambda s: bool(s.get('proposed_tracks')))
-        print(f"Polling ai_status: {status.get('ai_status', 'N/A')}")
+    assert ok, (
-        if status and status.get('pending_mma_spawn_approval') is True:
+        f"No proposed_tracks after 120s. "
-            print('[SIM] Worker spawn required. Clicking btn_approve_spawn...')
+        f"ai_status={status.get('ai_status')} "
-            client.click('btn_approve_spawn')
+        f"mma_streams={list(status.get('mma_streams', {}).keys())}"
-        elif status and status.get('pending_mma_step_approval') is True:
+    )
-            print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...')
+    n_proposed = len(status['proposed_tracks'])
-            client.click('btn_approve_mma_step')
+    print(f"[SIM] Got {n_proposed} proposed track(s): "
-        elif status and status.get('pending_tool_approval') is True:
+          f"{[t.get('title', t.get('id')) for t in status['proposed_tracks']]}")
            print('[SIM] Tool approval required. Clicking btn_approve_tool...')
            client.click('btn_approve_tool')
        if status and status.get('proposed_tracks') and len(status['proposed_tracks']) > 0:
            proposed_tracks_found = True
            break
        time.sleep(1)
    assert proposed_tracks_found, "Failed to find proposed tracks after planning epic."
-    # 4. Click 'Accept' to start tracks.
+    # ------------------------------------------------------------------
    # Stage 4: Accept tracks (triggers Tier 2 calls + engine.run)
    # ------------------------------------------------------------------
    client.click('btn_mma_accept_tracks')
-    time.sleep(2)
+    time.sleep(1.5)   # frame-sync: let _cb_accept_tracks run one frame + bg thread start
-    # 5. Wait for 'tracks' list to populate with our mock tracks.
+    # ------------------------------------------------------------------
-    tracks_populated = False
+    # Stage 5: Wait for tracks to be written to filesystem + refreshed
-    for _ in range(30): # Poll for up to 30 seconds
+    # ------------------------------------------------------------------
-        status = client.get_mma_status()
+    ok, status = _poll(client, timeout=90, label="wait-tracks-populated",
-        if status and status.get('pending_mma_spawn_approval') is True:
+                       condition=lambda s: bool(s.get('tracks')))
-            client.click('btn_approve_spawn')
+    assert ok, (
-        elif status and status.get('pending_mma_step_approval') is True:
+        f"No tracks appeared after 90s. "
-            client.click('btn_approve_mma_step')
+        f"ai_status={status.get('ai_status')}"
-        elif status and status.get('pending_tool_approval') is True:
+    )
-            client.click('btn_approve_tool')
+    tracks_list = status['tracks']
-        
+    print(f"[SIM] Tracks in project: {[t.get('title', t.get('id')) for t in tracks_list]}")
        tracks = status.get('tracks', [])
        if any('Mock Goal 1' in t.get('title', '') for t in tracks):
            tracks_populated = True
            break
        time.sleep(1)
    assert tracks_populated, "Failed to find 'Mock Goal 1' in tracks list after acceptance."
-    # 6. Verify that one of the new tracks can be loaded and its tickets appear in 'active_tickets'.
+    # ------------------------------------------------------------------
-    status_after_tracks = client.get_mma_status()
+    # Stage 6: Load first track, verify active_tickets populate
-    assert status_after_tracks is not None, "Failed to get MMA status after tracks populated."
+    # ------------------------------------------------------------------
-    tracks_list = status_after_tracks.get('tracks')
+    track_id = tracks_list[0]['id']
-    assert tracks_list is not None and len(tracks_list) > 0, "Tracks list is empty or not found."
+    print(f"[SIM] Loading track: {track_id}")
    client.click('btn_mma_load_track', user_data=track_id)
    time.sleep(1.0)   # frame-sync after load click
-    track_id_to_load = None
+    def _track_loaded(s):
-    for track in tracks_list:
+        at = s.get('active_track')
-        if 'Mock Goal 1' in track.get('title', ''):
+        at_id = at.get('id') if isinstance(at, dict) else at
-            track_id_to_load = track['id']
+        return at_id == track_id and bool(s.get('active_tickets'))
            break
    assert track_id_to_load is not None, "Could not find a track with 'Mock Goal 1' in its title."
    print(f"Attempting to load track with ID: {track_id_to_load}")
-    # Load the first track
+    ok, status = _poll(client, timeout=60, label="wait-track-loaded",
-    client.click('btn_mma_load_track', user_data=track_id_to_load)
+                       condition=_track_loaded)
    assert ok, (
        f"Track {track_id} did not load with tickets after 60s. "
        f"active_track={status.get('active_track')}"
    )
    print(f"[SIM] Track loaded with {len(status.get('active_tickets', []))} ticket(s).")
-    # Poll until 'active_track' is not None and 'active_tickets' are present
+    # ------------------------------------------------------------------
-    active_track_and_tickets_found = False
+    # Stage 7: Wait for engine to reach running/done
-    for _ in range(60): # Poll for up to 60 seconds
+    # ------------------------------------------------------------------
-        status = client.get_mma_status()
+    def _mma_active(s):
-        print(f"Polling load status: {status}")
+        return s.get('mma_status') in ('running', 'done')
        if status and status.get('pending_mma_spawn_approval') is True:
            print('[SIM] Worker spawn required. Clicking btn_approve_spawn...')
            client.click('btn_approve_spawn')
        elif status and status.get('pending_mma_step_approval') is True:
            print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...')
            client.click('btn_approve_mma_step')
        elif status and status.get('pending_tool_approval') is True:
            print('[SIM] Tool approval required. Clicking btn_approve_tool...')
            client.click('btn_approve_tool')
-        # Updated condition to correctly check active_track ID or value
+    ok, status = _poll(client, timeout=120, label="wait-mma-running",
-        active_track = status.get('active_track')
+                       condition=_mma_active)
-        if status and ( (isinstance(active_track, dict) and active_track.get('id') == track_id_to_load) or (active_track == track_id_to_load) ) and \
+    assert ok, (
-           'active_tickets' in status and len(status['active_tickets']) > 0:
+        f"MMA never reached running/done after 120s. "
-            active_track_and_tickets_found = True
+        f"mma_status={status.get('mma_status')}"
-            break
+    )
-        time.sleep(1)
+    print(f"[SIM] MMA status: {status.get('mma_status')}")
    assert active_track_and_tickets_found, f"Timed out waiting for track {track_id_to_load} to load and populate active tickets."
-    print(f"Successfully loaded and verified track ID: {track_id_to_load} with active tickets.")
+    # ------------------------------------------------------------------
    # Stage 8: Verify Tier 3 output appears in mma_streams
    # ------------------------------------------------------------------
    def _tier3_in_streams(s):
        streams = s.get('mma_streams', {})
        tier3_keys = [k for k in streams if 'Tier 3' in k]
        if not tier3_keys:
            return False
        return bool(streams[tier3_keys[0]].strip())
-    # 7. Poll for MMA status 'running' or 'done' (already started by Accept Tracks).
+    ok, status = _poll(client, timeout=120, label="wait-tier3-streams",
-    mma_running = False
+                       condition=_tier3_in_streams)
    for _ in range(120): # Poll for up to 120 seconds
        status = client.get_mma_status()
        print(f"Polling MMA status for 'running': {status.get('mma_status')}")
-        # Handle pending states during the run
+    streams = status.get('mma_streams', {})
-        if status and status.get('pending_mma_spawn_approval') is True:
+    tier3_keys = [k for k in streams if 'Tier 3' in k]
-            print('[SIM] Worker spawn required. Clicking btn_approve_spawn...')
+    assert ok, (
-            client.click('btn_approve_spawn')
+        f"No non-empty Tier 3 output in mma_streams after 120s. "
-        elif status and status.get('pending_mma_step_approval') is True:
+        f"streams keys={list(streams.keys())} "
-            print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...')
+        f"mma_status={status.get('mma_status')}"
-            client.click('btn_approve_mma_step')
+    )
        elif status and status.get('pending_tool_approval') is True:
            print('[SIM] Tool approval required. Clicking btn_approve_tool...')
            client.click('btn_approve_tool')
-        # Check if MMA is running
+    tier3_content = streams[tier3_keys[0]]
-        if status and status.get('mma_status') == 'running':
+    print(f"[SIM] Tier 3 output ({len(tier3_content)} chars): {tier3_content[:100]}...")
-            mma_running = True
+    print("[SIM] MMA complete lifecycle simulation PASSED.")
            break
        # Also check if it's already finished or error
        if status and status.get('mma_status') in ['done', 'error']:
            break
        time.sleep(1)
    assert mma_running or (status and status.get('mma_status') == 'done'), f"Timed out waiting for MMA status to become 'running' for track {track_id_to_load}."
    print(f"MMA status is: {status.get('mma_status')}")
    # 8. Verify 'active_tier' change and output in 'mma_streams'.
    streams_found = False
    for _ in range(60): # Give it more time for the worker to spawn and respond
        status = client.get_mma_status()
        # Handle approvals if they pop up during worker execution
        if status and status.get('pending_mma_spawn_approval') is True:
            print('[SIM] Worker spawn required. Clicking btn_approve_spawn...')
            client.click('btn_approve_spawn')
        elif status and status.get('pending_mma_step_approval') is True:
            print('[SIM] MMA step approval required. Clicking btn_approve_mma_step...')
            client.click('btn_approve_mma_step')
        elif status and status.get('pending_tool_approval') is True:
            print('[SIM] Tool approval required. Clicking btn_approve_tool...')
            client.click('btn_approve_tool')
        streams = status.get('mma_streams', {})
        print(f"Polling streams: {list(streams.keys())}")
        if streams and any("Tier 3" in k for k in streams.keys()):
            print(f"[SIM] Found Tier 3 worker output in streams: {list(streams.keys())}")
            # Check for our specific mock content
            tier3_key = [k for k in streams.keys() if "Tier 3" in k][0]
            if "SUCCESS: Mock Tier 3 worker" in streams[tier3_key]:
                print("[SIM] Verified mock worker output content.")
                streams_found = True
                break
        time.sleep(1)
    assert streams_found, "No Tier 3 mock output found in 'mma_streams'."
    print("MMA complete lifecycle simulation successful.")