feat(testing): stabilize simulation suite and fix gemini caching

2026-02-25 01:44:46 -05:00
parent fb80ce8c5a
commit c952d2f67b
23 changed files with 784 additions and 596 deletions
@@ -4,39 +4,76 @@ import time
 from simulation.sim_base import BaseSimulation, run_sim

 class ExecutionSimulation(BaseSimulation):
+    def setup(self, project_name="SimProject"):
+        super().setup(project_name)
+        if os.path.exists("hello.ps1"):
+            os.remove("hello.ps1")
+
    def run(self):
        print("\n--- Running Execution & Modals Simulation ---")
        
-        # 1. Trigger script generation
+        # 1. Trigger script generation (Async so we don't block on the wait loop)
        msg = "Create a hello.ps1 script that prints 'Simulation Test' and execute it."
        print(f"[Sim] Sending message to trigger script: {msg}")
-        self.sim.run_discussion_turn(msg)
+        self.sim.run_discussion_turn_async(msg)
        
-        # 2. Wait for confirmation event
-        print("[Sim] Waiting for confirmation event...")
-        ev = self.client.wait_for_event("script_confirmation_required", timeout=45)
+        # 2. Monitor for events and text responses
+        print("[Sim] Monitoring for script approvals and AI text...")
+        start_wait = time.time()
+        approved_count = 0
+        success = False
        
-        assert ev is not None, "Expected script_confirmation_required event"
-        print(f"[Sim] Event received: {ev}")
-        
-        # 3. Approve script
-        print("[Sim] Approving script execution...")
-        self.client.click("btn_approve_script")
-        time.sleep(2)
-        
-        # 4. Verify output in history or status
-        session = self.client.get_session()
-        entries = session.get('session', {}).get('entries', [])
-        
-        # Tool outputs are usually in history
-        success = any("Simulation Test" in e.get('content', '') for e in entries if e.get('role') in ['Tool', 'Function'])
-        if success:
-            print("[Sim] Output found in session history.")
-        else:
-            print("[Sim] Output NOT found in history yet, checking status...")
-            # Maybe check ai_status
+        consecutive_errors = 0
+        while time.time() - start_wait < 90:
+            # Check for error status (be lenient with transients)
            status = self.client.get_value("ai_status")
-            print(f"[Sim] Final Status: {status}")
+            if status and status.lower().startswith("error"):
+                consecutive_errors += 1
+                if consecutive_errors >= 3:
+                    print(f"[ABORT] Execution simulation aborted due to persistent GUI error: {status}")
+                    break
+            else:
+                consecutive_errors = 0
+
+            # Check for script confirmation event
+            ev = self.client.wait_for_event("script_confirmation_required", timeout=1)
+            if ev:
+                print(f"[Sim] Approving script #{approved_count+1}: {ev.get('script', '')[:50]}...")
+                self.client.click("btn_approve_script")
+                approved_count += 1
+                # Give more time if we just approved a script
+                start_wait = time.time()
+            
+            # Check if AI has responded with text yet
+            session = self.client.get_session()
+            entries = session.get('session', {}).get('entries', [])
+            
+            # Debug: log last few roles/content
+            if entries:
+                last_few = entries[-3:]
+                print(f"[Sim] Waiting... Last {len(last_few)} roles: {[e.get('role') for e in last_few]}")
+
+            if any(e.get('role') == 'AI' and e.get('content') for e in entries):
+                # Double check content for our keyword
+                for e in entries:
+                    if e.get('role') == 'AI' and "Simulation Test" in e.get('content', ''):
+                        print("[Sim] AI responded with expected text. Success.")
+                        success = True
+                        break
+                if success: break
+            
+            # Also check if output is already in history via tool role
+            for e in entries:
+                if e.get('role') in ['Tool', 'Function'] and "Simulation Test" in e.get('content', ''):
+                    print(f"[Sim] Expected output found in {e.get('role')} results. Success.")
+                    success = True
+                    break
+            if success: break
+
+            time.sleep(1.0)
+        
+        assert success, "Failed to observe script execution output or AI confirmation text"
+        print(f"[Sim] Final check: approved {approved_count} scripts.")

 if __name__ == "__main__":
    run_sim(ExecutionSimulation)