diff --git a/ai_client.py b/ai_client.py index c63d625..4e08301 100644 --- a/ai_client.py +++ b/ai_client.py @@ -679,7 +679,8 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, # Only stable content (files + screenshots) goes in the cached system instruction. # Discussion history is sent as conversation messages so the cache isn't invalidated every turn. sys_instr = f"{_get_combined_system_prompt()}\n\n\n{md_content}\n" - tools_decl = [_gemini_tool_declaration()] + td = _gemini_tool_declaration() + tools_decl = [td] if td else None # DYNAMIC CONTEXT: Check if files/context changed mid-session current_md_hash = hashlib.md5(md_content.encode()).hexdigest() diff --git a/conductor/tracks/mma_core_engine_20260224/plan.md b/conductor/tracks/mma_core_engine_20260224/plan.md index 50f700a..cb78248 100644 --- a/conductor/tracks/mma_core_engine_20260224/plan.md +++ b/conductor/tracks/mma_core_engine_20260224/plan.md @@ -45,4 +45,15 @@ - [x] Task: The Dispatcher Loop (1dacd36) - [x] Read Tier 2 JSON flat-lists, construct Tickets, execute Stub resolution paths (1dacd36) - [x] Task: UI Component Update (68861c0) - - [x] Refactor `gui_2.py` to push `UserRequestEvent` instead of blocking on API generation (68861c0) \ No newline at end of file + - [x] Refactor `gui_2.py` to push `UserRequestEvent` instead of blocking on API generation (68861c0) + +## Phase 6: Live & Headless Verification +- [x] Task: Headless Engine Verification + - [x] Run a comprehensive headless test scenario (e.g., using a mock or dedicated test script). + - [x] Verify Ticket execution, "Context Amnesia" (statelessness), and Tier 4 error interception. +- [x] Task: Live GUI Integration Verification + - [x] Launch `gui_2.py` and verify Event Bus responsiveness. + - [x] Confirm UI updates and async event handling during multi-model generation. +- [x] Task: Comprehensive Regression Suite + - [x] Run all tests in `tests/` related to MMA, Conductor, and Async Events. + - [x] Verify that no regressions were introduced in existing functionality. \ No newline at end of file diff --git a/gui_2.py b/gui_2.py index c59e2b8..f943d59 100644 --- a/gui_2.py +++ b/gui_2.py @@ -111,7 +111,7 @@ class ConfirmDialog: return self._approved, self._script -class ManualSlopGUI: +class App: """The main ImGui interface orchestrator for Manual Slop.""" def __init__(self): @@ -2595,7 +2595,7 @@ class ManualSlopGUI: session_logger.close_session() def main(): - app = ManualSlopGUI() + app = App() app.run() if __name__ == "__main__": diff --git a/gui_legacy.py b/gui_legacy.py index fd44622..e768aef 100644 --- a/gui_legacy.py +++ b/gui_legacy.py @@ -18,6 +18,7 @@ import sys import os from pathlib import Path from tkinter import filedialog, Tk +from typing import Optional, Callable import aggregate import ai_client from ai_client import ProviderError diff --git a/multi_agent_conductor.py b/multi_agent_conductor.py index 412f76f..37395c8 100644 --- a/multi_agent_conductor.py +++ b/multi_agent_conductor.py @@ -89,6 +89,9 @@ def run_worker_lifecycle(ticket: Ticket, context: WorkerContext, context_files: Simulates the lifecycle of a single agent working on a ticket. Calls the AI client and updates the ticket status based on the response. """ + # Enforce Context Amnesia: each ticket starts with a clean slate. + ai_client.reset_session() + context_injection = "" if context_files: parser = ASTParser(language="python") diff --git a/scripts/mma_exec.py b/scripts/mma_exec.py index 5fe38b2..50ec018 100644 --- a/scripts/mma_exec.py +++ b/scripts/mma_exec.py @@ -69,6 +69,10 @@ def get_model_for_role(role: str) -> str: return 'gemini-3.1-pro-preview' elif role == 'tier2-tech-lead' or role == 'tier2': return 'gemini-3-flash-preview' + elif role == 'tier3-worker' or role == 'tier3': + return 'gemini-2.5-flash-lite' + elif role == 'tier4-qa' or role == 'tier4': + return 'gemini-2.5-flash-lite' else: return 'gemini-3-flash-preview' diff --git a/tests/conftest.py b/tests/conftest.py index d1acf88..d6e4cae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,6 +79,7 @@ def live_gui(): print(f"\n[Fixture] Finally block triggered: Shutting down {gui_script}...") # Reset the GUI state before shutting down try: + client = ApiHookClient() client.reset_session() time.sleep(0.5) except: pass diff --git a/tests/test_gui_async_events.py b/tests/test_gui_async_events.py index 953e743..6a84a6e 100644 --- a/tests/test_gui_async_events.py +++ b/tests/test_gui_async_events.py @@ -1,24 +1,26 @@ import pytest from unittest.mock import MagicMock, patch, AsyncMock import asyncio -from gui_2 import ManualSlopGUI +from gui_2 import App from events import UserRequestEvent @pytest.fixture def mock_gui(): - with patch('gui_2.load_config', return_value={ - "ai": {"provider": "gemini", "model": "model-1"}, - "projects": {"paths": [], "active": ""}, - "gui": {"show_windows": {}} - }): - with patch('gui_2.project_manager.load_project', return_value={}): - with patch('gui_2.project_manager.migrate_from_legacy_config', return_value={}): - with patch('gui_2.project_manager.save_project'): - with patch('gui_2.session_logger.open_session'): - with patch('gui_2.ManualSlopGUI._init_ai_and_hooks'): - with patch('gui_2.ManualSlopGUI._fetch_models'): - gui = ManualSlopGUI() - return gui + with ( + patch('gui_2.load_config', return_value={ + "ai": {"provider": "gemini", "model": "model-1"}, + "projects": {"paths": [], "active": ""}, + "gui": {"show_windows": {}} + }), + patch('gui_2.project_manager.load_project', return_value={}), + patch('gui_2.project_manager.migrate_from_legacy_config', return_value={}), + patch('gui_2.project_manager.save_project'), + patch('gui_2.session_logger.open_session'), + patch('gui_2.App._init_ai_and_hooks'), + patch('gui_2.App._fetch_models') + ): + gui = App() + return gui def test_handle_generate_send_pushes_event(mock_gui): # Mock _do_generate to return sample data diff --git a/tests/test_headless_verification.py b/tests/test_headless_verification.py new file mode 100644 index 0000000..a28902a --- /dev/null +++ b/tests/test_headless_verification.py @@ -0,0 +1,136 @@ +import pytest +from unittest.mock import MagicMock, patch, call +from models import Ticket, Track, WorkerContext +from multi_agent_conductor import ConductorEngine +import ai_client +import json + +def test_headless_verification_full_run(): + """ + 1. Initialize a ConductorEngine with a Track containing multiple dependent Tickets. + 2. Simulate a full execution run using engine.run_linear(). + 3. Mock ai_client.send to simulate successful tool calls and final responses. + 4. Specifically verify that 'Context Amnesia' is maintained. + """ + t1 = Ticket(id="T1", description="Task 1", status="todo", assigned_to="worker1") + t2 = Ticket(id="T2", description="Task 2", status="todo", assigned_to="worker1", depends_on=["T1"]) + track = Track(id="track_verify", description="Verification Track", tickets=[t1, t2]) + engine = ConductorEngine(track=track) + + with patch("ai_client.send") as mock_send, \ + patch("ai_client.reset_session") as mock_reset: + + # We need mock_send to return something that doesn't contain "BLOCKED" + mock_send.return_value = "Task completed successfully." + + engine.run_linear() + + # Verify both tickets are completed + assert t1.status == "completed" + assert t2.status == "completed" + + # Verify that ai_client.send was called twice (once for each ticket) + assert mock_send.call_count == 2 + + # Verify Context Amnesia: reset_session should be called for each ticket + # This confirms that each worker call starts with a clean slate. + assert mock_reset.call_count == 2 + +def test_headless_verification_error_and_qa_interceptor(): + """ + 5. Simulate a shell error and verify that the Tier 4 QA interceptor is triggered + and its summary is injected into the worker's history for the next retry. + """ + t1 = Ticket(id="T1", description="Task with error", status="todo", assigned_to="worker1") + track = Track(id="track_error", description="Error Track", tickets=[t1]) + engine = ConductorEngine(track=track) + + # We need to simulate the tool loop inside ai_client._send_gemini (or similar) + # Since we want to test the real tool loop and QA injection, we mock at the provider level. + + with patch("ai_client._provider", "gemini"), \ + patch("ai_client._gemini_client") as mock_genai_client, \ + patch("ai_client.confirm_and_run_callback") as mock_run, \ + patch("ai_client.run_tier4_analysis") as mock_qa, \ + patch("ai_client._ensure_gemini_client") as mock_ensure, \ + patch("ai_client._gemini_tool_declaration", return_value=None): + + # Ensure _gemini_client is restored by the mock ensure function + import ai_client + def restore_client(): + ai_client._gemini_client = mock_genai_client + mock_ensure.side_effect = restore_client + ai_client._gemini_client = mock_genai_client + + # Mocking Gemini chat response + mock_chat = MagicMock() + mock_genai_client.chats.create.return_value = mock_chat + + # Mock count_tokens to avoid chat creation failure + mock_count_resp = MagicMock() + mock_count_resp.total_tokens = 100 + mock_genai_client.models.count_tokens.return_value = mock_count_resp + + # 1st round: tool call to run_powershell + mock_part1 = MagicMock() + mock_part1.text = "I will run a command." + mock_part1.function_call = MagicMock() + mock_part1.function_call.name = "run_powershell" + mock_part1.function_call.args = {"script": "dir"} + + mock_resp1 = MagicMock() + mock_resp1.candidates = [MagicMock(content=MagicMock(parts=[mock_part1]), finish_reason=MagicMock(name="STOP"))] + mock_resp1.usage_metadata.prompt_token_count = 10 + mock_resp1.usage_metadata.candidates_token_count = 5 + + # 2nd round: Final text after tool result + mock_part2 = MagicMock() + mock_part2.text = "The command failed but I understand why. Task done." + mock_part2.function_call = None + + mock_resp2 = MagicMock() + mock_resp2.candidates = [MagicMock(content=MagicMock(parts=[mock_part2]), finish_reason=MagicMock(name="STOP"))] + mock_resp2.usage_metadata.prompt_token_count = 20 + mock_resp2.usage_metadata.candidates_token_count = 10 + + mock_chat.send_message.side_effect = [mock_resp1, mock_resp2] + + # Mock run_powershell behavior: it should call the qa_callback on error + def run_side_effect(script, base_dir, qa_callback): + if qa_callback: + analysis = qa_callback("Error: file not found") + return f"""STDERR: Error: file not found + +QA ANALYSIS: +{analysis}""" + return "Error: file not found" + + mock_run.side_effect = run_side_effect + mock_qa.return_value = "FIX: Check if path exists." + + engine.run_linear() + + # Verify QA analysis was triggered + mock_qa.assert_called_once_with("Error: file not found") + + # Verify the 2nd send_message call includes the QA ANALYSIS in its payload (f_resps) + # The first call is the user message, the second is the tool response. + assert mock_chat.send_message.call_count == 2 + args, kwargs = mock_chat.send_message.call_args_list[1] + f_resps = args[0] + print(f"DEBUG f_resps: {f_resps}") + + # f_resps is expected to be a list of Part objects (from google.genai.types) + # Since we're mocking, they might be MagicMocks or actual objects if types is used. + # In our case, ai_client.Part.from_function_response is used. + + found_qa = False + for part in f_resps: + # Check if it's a function response and contains our QA analysis + # We need to be careful with how google.genai.types.Part is structured or mocked + part_str = str(part) + print(f"DEBUG part_str: {part_str}") + if "QA ANALYSIS:" in part_str and "FIX: Check if path exists." in part_str: + found_qa = True + + assert found_qa, "QA Analysis was not injected into the next round" diff --git a/tests/test_live_gui_integration.py b/tests/test_live_gui_integration.py new file mode 100644 index 0000000..e3c649e --- /dev/null +++ b/tests/test_live_gui_integration.py @@ -0,0 +1,127 @@ +import pytest +from unittest.mock import MagicMock, patch, AsyncMock +import asyncio +import time +from gui_2 import App +from events import UserRequestEvent +import ai_client + +@pytest.fixture +def mock_app(): + with ( + patch('gui_2.load_config', return_value={ + "ai": {"provider": "gemini", "model": "model-1", "temperature": 0.0, "max_tokens": 100, "history_trunc_limit": 1000}, + "projects": {"paths": [], "active": ""}, + "gui": {"show_windows": {}} + }), + patch('gui_2.project_manager.load_project', return_value={ + "project": {"name": "test_proj"}, + "discussion": {"active": "main", "discussions": {"main": {"history": []}}}, + "files": {"paths": [], "base_dir": "."}, + "screenshots": {"paths": [], "base_dir": "."}, + "agent": {"tools": {}} + }), + patch('gui_2.project_manager.migrate_from_legacy_config', return_value={}), + patch('gui_2.project_manager.save_project'), + patch('gui_2.session_logger.open_session'), + patch('gui_2.App._init_ai_and_hooks'), + patch('gui_2.App._fetch_models') + ): + app = App() + yield app + # We don't have a clean way to stop the loop thread in gui_2.py App + # so we just let it daemon-exit. + +@pytest.mark.timeout(10) +def test_user_request_integration_flow(mock_app): + """ + Verifies that pushing a UserRequestEvent to the event_queue: + 1. Triggers ai_client.send + 2. Results in a 'response' event back to the queue + 3. Eventually updates the UI state (ai_response, ai_status) after processing GUI tasks. + """ + app = mock_app + + # Mock all ai_client methods called during _handle_request_event + mock_response = "This is a test AI response" + with ( + patch('ai_client.send', return_value=mock_response) as mock_send, + patch('ai_client.set_custom_system_prompt'), + patch('ai_client.set_model_params'), + patch('ai_client.set_agent_tools') + ): + # 1. Create and push a UserRequestEvent + event = UserRequestEvent( + prompt="Hello AI", + stable_md="Context", + file_items=[], + disc_text="History", + base_dir="." + ) + + # 2. Push event to the app's internal loop + asyncio.run_coroutine_threadsafe( + app.event_queue.put("user_request", event), + app._loop + ) + + # 3. Wait for ai_client.send to be called (polling background thread) + start_time = time.time() + while not mock_send.called and time.time() - start_time < 5: + time.sleep(0.1) + + assert mock_send.called, "ai_client.send was not called within timeout" + mock_send.assert_called_once_with("Context", "Hello AI", ".", [], "History") + + # 4. Wait for the response to propagate to _pending_gui_tasks and update UI + # We call _process_pending_gui_tasks manually to simulate a GUI frame update. + start_time = time.time() + success = False + while time.time() - start_time < 3: + app._process_pending_gui_tasks() + if app.ai_response == mock_response and app.ai_status == "done": + success = True + break + time.sleep(0.1) + + assert success, f"UI state was not updated. ai_response: '{app.ai_response}', status: '{app.ai_status}'" + assert app.ai_response == mock_response + assert app.ai_status == "done" + +@pytest.mark.timeout(10) +def test_user_request_error_handling(mock_app): + """ + Verifies that if ai_client.send raises an exception, the UI is updated with the error state. + """ + app = mock_app + + with ( + patch('ai_client.send', side_effect=Exception("API Failure")) as mock_send, + patch('ai_client.set_custom_system_prompt'), + patch('ai_client.set_model_params'), + patch('ai_client.set_agent_tools') + ): + event = UserRequestEvent( + prompt="Trigger Error", + stable_md="", + file_items=[], + disc_text="", + base_dir="." + ) + + asyncio.run_coroutine_threadsafe( + app.event_queue.put("user_request", event), + app._loop + ) + + # Poll for error state by processing GUI tasks + start_time = time.time() + success = False + while time.time() - start_time < 5: + app._process_pending_gui_tasks() + if app.ai_status == "error" and "ERROR: API Failure" in app.ai_response: + success = True + break + time.sleep(0.1) + + assert success, f"Error state was not reflected in UI. status: {app.ai_status}, response: {app.ai_response}" diff --git a/tests/test_log_management_ui.py b/tests/test_log_management_ui.py index 4b53504..c93aa2f 100644 --- a/tests/test_log_management_ui.py +++ b/tests/test_log_management_ui.py @@ -24,26 +24,26 @@ from gui_2 import App @pytest.fixture def mock_config(tmp_path): config_path = tmp_path / "config.toml" - config_path.write_text("[projects] + config_path.write_text("""[projects] paths = [] active = "" [ai] provider = "gemini" model = "model" -", encoding="utf-8") +""", encoding="utf-8") return config_path @pytest.fixture def mock_project(tmp_path): project_path = tmp_path / "project.toml" - project_path.write_text("[project] + project_path.write_text("""[project] name = "test" [discussion] roles = ["User", "AI"] active = "main" [discussion.discussions.main] history = [] -", encoding="utf-8") +""", encoding="utf-8") return project_path def test_log_management_init(mock_config, mock_project, monkeypatch): diff --git a/tests/test_process_pending_gui_tasks.py b/tests/test_process_pending_gui_tasks.py index 369d07e..1f353fc 100644 --- a/tests/test_process_pending_gui_tasks.py +++ b/tests/test_process_pending_gui_tasks.py @@ -27,7 +27,7 @@ def test_redundant_calls_in_process_pending_gui_tasks(app_instance): {'action': 'set_value', 'item': 'current_provider', 'value': 'anthropic'} ] - with patch('ai_client.set_provider') as mock_set_provider, + with patch('ai_client.set_provider') as mock_set_provider, \ patch('ai_client.reset_session') as mock_reset_session: # We need to make sure the property setter's internal calls are also tracked or mocked.