d28e373e54
Root cause discovered after the user's batched test run revealed the
stress test still failed when run after the execution test. The
gemini_cli_adapter persists session_id across tests (singleton). The
execution test set session_id to 'mock-worker-ticket-A-1' (from the
worker call). When the stress test's epic call ran, it used
--resume with that stale session_id. The mock's worker check had
a session_id fallback:
    if 'You are assigned to Ticket' in prompt or session_id.startswith('mock-worker-'):
        ...worker response...
The fallback incorrectly matched the stress test's epic call
(which used the stale worker session_id), causing the mock to return
a worker response instead of an epic response. The production's
generate_tracks then failed to parse the response, returning 0 tracks.
Fix: remove the session_id.startswith('mock-worker-') fallback. Route
workers based on prompt content only. The session_id is for the
production's session management, not for the mock's routing.
This is a 'fix the test infrastructure' change (the mock is a test
artifact, not production). The production's gemini_cli_adapter could
also be fixed to reset session_id on reset_session(), but that's
out of scope for this track.
Verified: the failing test combination (execution test before
stress test) was reproduced and the fix resolves it. The isolated
stress test still passes (3 consecutive runs).
Note: a separate issue was discovered where self.tracks is being
replaced between track appends (different id(self.tracks) values
in the diagnostic log). This causes the API to read 0 tracks after
the accept. The root cause is unclear from this session's
investigation; it appears to be a production code issue where the
in-memory track state is being overwritten by a disk read from
a different project path. This is documented as a follow-up.
176 lines
6.6 KiB
Python
176 lines
6.6 KiB
Python
import sys
|
|
import json
|
|
import os
|
|
|
|
# File-backed call counter. Keeping the count on disk lets it persist across
# separate runs of this script (per the original note, the mock gemini CLI is
# a short-lived subprocess started once per send() call, so in-memory state
# would be lost between calls).
#
# The location is derived from this file's own directory, independent of the
# process cwd:
#   <directory of this file>/../artifacts/.mock_concurrent_mma_call_count
_CALL_COUNT_FILE = os.path.abspath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        os.pardir,
        "artifacts",
        ".mock_concurrent_mma_call_count",
    )
)
|
|
|
|
def _next_call_count() -> int:
    """Increment the on-disk mock call counter and return the new value.

    The counter is stored as decimal text in ``_CALL_COUNT_FILE``. A missing,
    empty, or unparsable file counts as zero, so a corrupted counter file is
    overwritten with a fresh count instead of making every later call fail.

    The read-modify-write sequence is NOT locked: two mock processes running
    at the same moment may both observe the same previous value.

    Returns:
        The incremented count, or 0 when the counter file cannot be read or
        written (the failure is deliberately non-fatal for the mock).
    """
    try:
        with open(_CALL_COUNT_FILE, "r", encoding="utf-8") as f:
            previous = int(f.read().strip() or "0")
    except FileNotFoundError:
        # First call: no counter file yet.
        previous = 0
    except ValueError:
        # Non-numeric or undecodable content (UnicodeDecodeError is a
        # ValueError). Restart from zero so the write below repairs the file.
        previous = 0
    except OSError:
        return 0

    current = previous + 1
    try:
        os.makedirs(os.path.dirname(_CALL_COUNT_FILE), exist_ok=True)
        with open(_CALL_COUNT_FILE, "w", encoding="utf-8") as f:
            f.write(str(current))
    except OSError:
        return 0
    return current
|
|
|
|
def main() -> None:
    """Run one mock gemini CLI invocation.

    Reads the prompt from stdin, picks a canned reply from the prompt text,
    and prints it to stdout as two JSON lines (a "message" event followed by
    a "result" event). Routing is checked in this order:

    1. Sprint planning: prompt contains 'generate the implementation
       tickets'. The track label (A, B or C; default A) is taken from the
       first of "Track A" / "Track B" / "Track C" found in the prompt.
    2. Worker execution: prompt contains 'You are assigned to Ticket'.
    3. Epic initialization: any other non-blank prompt (catch-all, so
       test-specific epic prompts still get a parseable track list).
    4. Default: blank or empty prompt.

    The ``--resume`` session id is logged and used only as a last-resort
    label for the worker ticket id; it never influences which branch is
    taken. Per the original notes, the adapter keeps the session id across
    tests, so routing on it returned a worker reply to a later test's epic
    call.
    """
    try:
        prompt = sys.stdin.read()
    except (OSError, ValueError, AttributeError):
        # Unreadable, closed, undecodable, or missing stdin: treat as empty.
        prompt = ""

    session_id = _resume_session_id(sys.argv[1:])
    call_n = _next_call_count()
    _diag(f"call_n={call_n} session_id={session_id!r}")

    # 1. Sprint planning. Checked before the epic catch-all so it wins.
    if 'generate the implementation tickets' in prompt:
        track_label = next(
            (label for label in "ABC" if f"Track {label}" in prompt),
            "A",
        )
        _emit_sprint_ticket(track_label)
        _diag(f"ROUTED TO: sprint track={track_label}")
        return

    # 2. Worker execution. Checked before the epic catch-all so it wins.
    if 'You are assigned to Ticket' in prompt:
        tid = _worker_ticket_id(prompt, session_id)
        _diag(f"ROUTED TO: worker tid={tid}")
        _emit_reply(f"Working on {tid}. Done.", f"mock-worker-{tid}", 50)
        return

    # 3. Epic initialization: catch-all for every other non-blank prompt.
    if prompt.strip():
        tracks = [
            {"id": "track-a", "goal": "Track A Goal", "title": "Track A"},
            {"id": "track-b", "goal": "Track B Goal", "title": "Track B"},
        ]
        _emit_reply(json.dumps(tracks), "mock-epic", 100)
        _diag("ROUTED TO: epic_catchall")
        return

    # 4. Default: nothing (or only whitespace) arrived on stdin.
    _diag("ROUTED TO: default")
    _emit_reply(
        f"Mock response. Received prompt: {prompt[:100]}...",
        "mock-default",
        10,
    )


# Environment variable that, when set to a file path, redirects the
# diagnostic log to that path.
_DIAG_LOG_ENV = "MOCK_CONCURRENT_MMA_DIAG_LOG"

# Built-in diagnostic log location. It is machine-specific, so it is used
# only when its directory already exists (see _diag).
_DIAG_LOG_DEFAULT = (
    "C:\\projects\\manual_slop_tier2\\tests\\artifacts\\tier2_state"
    "\\fix_mma_concurrent_tracks_sim_20260627\\mock_diag.log"
)


def _diag(message: str) -> None:
    """Best-effort append of ``[MOCK] <message>`` to the diagnostic log."""
    path = os.environ.get(_DIAG_LOG_ENV, "")
    if not path:
        # Without an override, fall back to the built-in path only where its
        # directory exists. On POSIX, backslashes are ordinary filename
        # characters, so opening the Windows path unguarded would create a
        # stray file in the current working directory.
        if not os.path.isdir(os.path.dirname(_DIAG_LOG_DEFAULT)):
            return
        path = _DIAG_LOG_DEFAULT
    try:
        with open(path, "ab") as log:
            log.write(f"[MOCK] {message}\n".encode("utf-8", "backslashreplace"))
    except OSError:
        # Diagnostics must never break the mock's stdout protocol.
        pass


def _resume_session_id(argv: list) -> str:
    """Return the argument following ``--resume`` in *argv*, or "" if absent."""
    try:
        return argv[argv.index("--resume") + 1]
    except (ValueError, IndexError):
        return ""


def _worker_ticket_id(prompt: str, session_id: str) -> str:
    """Derive the ticket id to echo back in a worker reply.

    Prefers the id named in the prompt (``Ticket ticket-<letter>-<number>``,
    case-insensitive). Any track letter and ticket number are accepted, so
    the Track C tickets produced by the sprint branch are recognized too.
    Falls back to the suffix of a ``mock-worker-`` session id, then to
    "unknown". The session id only labels the reply; it is never used to
    decide that a call is a worker call.
    """
    import re

    match = re.search(r'Ticket (ticket-[A-Z]-\d+)', prompt, re.IGNORECASE)
    if match:
        return match.group(1)
    if session_id.startswith("mock-worker-"):
        return session_id[len("mock-worker-"):]
    return "unknown"


def _emit_reply(content: str, session_id: str, total_tokens: int) -> None:
    """Print the message/result JSON line pair that makes up one mock reply."""
    half = total_tokens // 2
    print(json.dumps({
        "type": "message",
        "role": "assistant",
        "content": content,
    }), flush=True)
    print(json.dumps({
        "type": "result",
        "status": "success",
        "stats": {
            "total_tokens": total_tokens,
            "input_tokens": half,
            "output_tokens": half,
        },
        "session_id": session_id,
    }), flush=True)
|
|
|
|
def _emit_sprint_ticket(track_label: str) -> None:
    """Print the mock sprint-planning reply for the given track.

    Writes two JSON lines to stdout: a "message" event whose content is a
    JSON-encoded list holding the single ticket ``ticket-<track_label>-1``,
    then a "result" event carrying the session id
    ``mock-sprint-<track_label>``.
    """
    ticket = {
        "id": f"ticket-{track_label}-1",
        "description": f"Ticket {track_label} 1",
        "status": "todo",
        "assigned_to": "worker",
        "depends_on": [],
    }
    message_event = {
        "type": "message",
        "role": "assistant",
        "content": json.dumps([ticket]),
    }
    result_event = {
        "type": "result",
        "status": "success",
        "stats": {"total_tokens": 100, "input_tokens": 50, "output_tokens": 50},
        "session_id": f"mock-sprint-{track_label}",
    }
    # One JSON document per line, flushed immediately after each write.
    for event in (message_event, result_event):
        print(json.dumps(event), flush=True)
|
|
|
|
# Script entry point: produce one mock reply when executed directly; importing
# the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|