checkpoint: massive refactor
This commit is contained in:
@@ -7,137 +7,113 @@ import json
|
||||
|
||||
@pytest.mark.asyncio
async def test_headless_verification_full_run():
    """Headless end-to-end run of the ConductorEngine.

    1. Initialize a ConductorEngine with a Track containing multiple dependent Tickets.
    2. Simulate a full execution run using engine.run_linear().
    3. Mock ai_client.send to simulate successful tool calls and final responses.
    4. Specifically verify that 'Context Amnesia' is maintained
       (ai_client.reset_session is invoked once per ticket).
    """
    # Two tickets on one track; T2 depends on T1 so run_linear must order them.
    t1 = Ticket(id="T1", description="Task 1", status="todo", assigned_to="worker1")
    t2 = Ticket(id="T2", description="Task 2", status="todo", assigned_to="worker1", depends_on=["T1"])
    track = Track(id="track_verify", description="Verification Track", tickets=[t1, t2])

    from events import AsyncEventQueue
    queue = AsyncEventQueue()
    engine = ConductorEngine(track=track, event_queue=queue)

    with patch("ai_client.send") as mock_send, \
         patch("ai_client.reset_session") as mock_reset:
        # We need mock_send to return something that doesn't contain "BLOCKED",
        # otherwise the engine would treat the ticket as stuck.
        mock_send.return_value = "Task completed successfully."

        await engine.run_linear()

        # Verify both tickets are completed.
        assert t1.status == "completed"
        assert t2.status == "completed"

        # Verify that ai_client.send was called twice (once for each ticket).
        assert mock_send.call_count == 2

        # Verify Context Amnesia: reset_session should be called for each ticket,
        # proving no conversational state leaks between tickets.
        assert mock_reset.call_count == 2
|
||||
|
||||
@pytest.mark.asyncio
async def test_headless_verification_error_and_qa_interceptor():
    """Verify the Tier 4 QA interceptor fires on a shell error.

    5. Simulate a shell error and verify that the Tier 4 QA interceptor is triggered
       and its summary is injected into the worker's history for the next retry.
    """
    t1 = Ticket(id="T1", description="Task with error", status="todo", assigned_to="worker1")
    track = Track(id="track_error", description="Error Track", tickets=[t1])

    from events import AsyncEventQueue
    queue = AsyncEventQueue()
    engine = ConductorEngine(track=track, event_queue=queue)

    # We need to simulate the tool loop inside ai_client._send_gemini (or similar).
    # Since we want to test the real tool loop and QA injection, we mock at the provider level.
    with patch("ai_client._provider", "gemini"), \
         patch("ai_client._gemini_client") as mock_genai_client, \
         patch("ai_client.confirm_and_run_callback") as mock_run, \
         patch("ai_client.run_tier4_analysis") as mock_qa, \
         patch("ai_client._ensure_gemini_client") as mock_ensure, \
         patch("ai_client._gemini_tool_declaration", return_value=None):
        # Ensure _gemini_client is restored by the mocked ensure function:
        # reset_session may null it out between tickets, so _ensure_gemini_client
        # must put our mock back.
        import ai_client

        def restore_client():
            ai_client._gemini_client = mock_genai_client

        mock_ensure.side_effect = restore_client
        ai_client._gemini_client = mock_genai_client

        # Mocking the Gemini chat object returned by chats.create().
        mock_chat = MagicMock()
        mock_genai_client.chats.create.return_value = mock_chat

        # Mock count_tokens to avoid chat creation failure.
        mock_count_resp = MagicMock()
        mock_count_resp.total_tokens = 100
        mock_genai_client.models.count_tokens.return_value = mock_count_resp

        # 1st round: the model issues a tool call to run_powershell.
        mock_part1 = MagicMock()
        mock_part1.text = "I will run a command."
        mock_part1.function_call = MagicMock()
        mock_part1.function_call.name = "run_powershell"
        mock_part1.function_call.args = {"script": "dir"}

        mock_resp1 = MagicMock()
        mock_resp1.candidates = [MagicMock(content=MagicMock(parts=[mock_part1]), finish_reason=MagicMock(name="STOP"))]
        mock_resp1.usage_metadata.prompt_token_count = 10
        mock_resp1.usage_metadata.candidates_token_count = 5

        # 2nd round: final text after the tool result is fed back.
        mock_part2 = MagicMock()
        mock_part2.text = "The command failed but I understand why. Task done."
        mock_part2.function_call = None

        mock_resp2 = MagicMock()
        mock_resp2.candidates = [MagicMock(content=MagicMock(parts=[mock_part2]), finish_reason=MagicMock(name="STOP"))]
        mock_resp2.usage_metadata.prompt_token_count = 20
        mock_resp2.usage_metadata.candidates_token_count = 10

        mock_chat.send_message.side_effect = [mock_resp1, mock_resp2]

        # Mock run_powershell behavior: it should call the qa_callback on error
        # and append the QA analysis to the tool output it returns.
        def run_side_effect(script, base_dir, qa_callback):
            if qa_callback:
                analysis = qa_callback("Error: file not found")
                return f"""STDERR: Error: file not found

QA ANALYSIS:
{analysis}"""
            return "Error: file not found"

        mock_run.side_effect = run_side_effect
        mock_qa.return_value = "FIX: Check if path exists."

        await engine.run_linear()

        # Verify QA analysis was triggered with the raw error text.
        mock_qa.assert_called_once_with("Error: file not found")

        # Verify the 2nd send_message call includes the QA ANALYSIS in its payload (f_resps).
        # The first call is the user message, the second is the tool response.
        assert mock_chat.send_message.call_count == 2
        args, kwargs = mock_chat.send_message.call_args_list[1]
        f_resps = args[0]
        print(f"DEBUG f_resps: {f_resps}")

        # f_resps is expected to be a list of Part objects (from google.genai.types).
        # Since we're mocking, they might be MagicMocks or actual objects if types is used.
        # In our case, ai_client.Part.from_function_response is used, so inspect the
        # string form rather than attribute structure.
        found_qa = False
        for part in f_resps:
            # Check if it's a function response and contains our QA analysis.
            # We need to be careful with how google.genai.types.Part is structured or mocked.
            part_str = str(part)
            print(f"DEBUG part_str: {part_str}")
            if "QA ANALYSIS:" in part_str and "FIX: Check if path exists." in part_str:
                found_qa = True

        assert found_qa, "QA Analysis was not injected into the next round"
|
||||
|
||||
Reference in New Issue
Block a user