conductor(track): initialize test_batching_post_refactor_polish_20260607 spec/plan/state

2026-06-08 00:27:32 -04:00
parent 828050ae4f
commit 2b56ab3c5c
4 changed files with 1256 additions and 0 deletions
@@ -0,0 +1,92 @@
+{
+  "track_id": "test_batching_post_refactor_polish_20260607",
+  "name": "Test Batching — Post-Refactor Polish",
+  "initialized": "2026-06-08",
+  "owner": "tier2-tech-lead",
+  "priority": "medium",
+  "status": "active",
+  "type": "developer tooling + observability polish",
+  "scope": {
+    "new_files": [
+      "scripts/test_failure_parser.py",
+      "tests/test_test_failure_parser.py",
+      "tests/test_live_gui_foregrounding.py"
+    ],
+    "modified_files": [
+      "scripts/run_tests_batched.py",
+      "tests/conftest.py",
+      "tests/test_command_palette_sim.py",
+      "tests/test_workflow_sim.py",
+      "tests/test_undo_redo_sim.py"
+    ],
+    "deleted_files": "~45 scratch files in tests/artifacts/ (after reference verification)"
+  },
+  "blocked_by": {
+    "test_batching_refactor_20260606": "must be SHIPPED before this track begins; the new orchestrator's _run_batch is the integration point"
+  },
+  "blocks": [],
+  "estimated_phases": 5,
+  "spec": "spec.md",
+  "plan": "plan.md",
+  "current_state_audit_commit": "2db14361",
+  "current_state_audit": {
+    "already_implemented": [
+      "App._diag_layout_state() at src/gui_2.py:507-544 (commit 818537b3) — logs show_windows count, visible defaults, stale window name warnings",
+      "manualslop_layout_default.ini at tests/artifacts/manualslop_layout_default.ini (2,699 bytes; whitelisted in .gitignore line 17)",
+      "tests/conftest.py:418-421 copies the layout artifact into the test workspace (replaces the prior 'do NOT copy' block from 7a4f71e7)",
+      "_default_windows updated at src/app_controller.py:1832-1855 (MMA Dashboard=False, Log Management=True, Diagnostics=True)",
+      "_STALE_WINDOW_NAMES set at src/gui_2.py:530-533 (10 names; Theme removed)",
+      "Skip markers from e09e6823 resolved in 8d58d7fc (warmup races), a36aad50 (gui_events_v2), 91b34ae8 (live_gui_filedialog), ff523f7e (project_switch_persona)",
+      "RUN_MMA_INTEGRATION env-var gate at tests/test_mma_step_mode_sim.py:24-27 (opt-in integration gate, not a broken test)",
+      "scripts/cleanup_orphaned_processes.py (commit 5e1867bb) — manages stale subprocesses; preserves MCP servers"
+    ],
+    "gaps_to_fill": [
+      "New orchestrator (post-refactor) uses subprocess.run(capture_output=True) and only prints stdout tail on failure — no per-file failure list (regression in failure visibility vs current)",
+      "_extract_failed_files (if implemented in refactor's Phase 0) is in the LEGACY script that gets renamed to .legacy in refactor's Phase 3, then deleted in Phase 4; needs to be lifted to a shared location",
+      "live_gui fixture doesn't bring sloppy.py's window to front (conftest.py:live_gui)",
+      "live_gui tests have no per-test focus signal",
+      "tests/artifacts/ has ~45 scratch files (gitignored, but clutter the directory)"
+    ]
+  },
+  "verification_criteria": [
+    "scripts/test_failure_parser.py exists and exports extract_failed_files (no re import; grep returns empty)",
+    "11+ unit tests in tests/test_test_failure_parser.py all pass",
+    "Legacy run_tests_batched.py (if not yet deleted by refactor) imports extract_failed_files from the new module",
+    "New run_tests_batched.py _run_batch calls extract_failed_files on captured output; per-file failure list in SUMMARY",
+    "tests/conftest.py:_foreground_subprocess_window exists; 3 unit tests pass; live_gui fixture calls it after subprocess.Popen",
+    "tests/conftest.py:focus_test_panel exists; 3+ *_sim.py tests call it in setup",
+    "Scratch files from FR-19 deleted; directory contains only the preserved files/directories from FR-20",
+    "Existing test suite still passes for batches 1-4 (no regressions)",
+    "Batch 5's timeout (test_z_negative_flows) reported as exactly 1 failed file, not all 42",
+    "All commits atomic per-task with descriptive messages",
+    "No commits include the user's TOML files (config.toml, project.toml, project_history.toml)",
+    "No commits include manualslop_layout.ini at the repo root"
+  ],
+  "anti_patterns_to_avoid": [
+    "DO NOT use the native edit tool on .py files (destroys 1-space indent; use manual-slop_edit_file or manual-slop_py_update_definition)",
+    "DO NOT use git restore / git checkout -- <file> / git reset without explicit user permission in the same message (HARD BAN)",
+    "DO NOT commit the user's TOML files",
+    "DO NOT add re (regex) to the failure parser (AGENTS.md standing ban)",
+    "DO NOT add per-file re-run logic to the orchestrator",
+    "DO NOT add inline comments to source code (docstrings are fine)",
+    "DO NOT add new external dependencies (no pyproject.toml change)",
+    "DO NOT use mock patches to pseudo API calls or hooks when the app source changes (adapt tests properly)"
+  ],
+  "links": {
+    "spec": "spec.md",
+    "plan": "plan.md",
+    "parent_track": "conductor/tracks/test_batching_refactor_20260606/",
+    "upstream_audit": "conductor/tracks/startup_speedup_20260606/state.toml (conftest_warmup_wait)",
+    "architecture_docs": [
+      "docs/guide_architecture.md",
+      "docs/guide_testing.md",
+      "docs/guide_api_hooks.md",
+      "docs/guide_simulations.md"
+    ],
+    "policy_docs": [
+      "AGENTS.md (no regex, no native edit, no git restore without permission)",
+      "conductor/workflow.md (Skip-Marker Policy, Phase Completion Verification)",
+      "conductor/product-guidelines.md (1-space indent, no comments, type hints)"
+    ]
+  }
+}
@@ -0,0 +1,845 @@
+# Test Batching — Post-Refactor Polish Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Polish the test batching orchestrator and live_gui fixture AFTER `test_batching_refactor_20260606` ships. Deliver: (1) shared `extract_failed_files` parser module (lifted from the legacy script's `_extract_failed_files`) used by both the legacy and new orchestrators, (2) per-file failure list in the new orchestrator's SUMMARY, (3) `live_gui` subprocess window foregrounding, (4) `focus_test_panel` helper wired into 3 starter sims, (5) `tests/artifacts/` scratch cleanup.
+
+**Architecture:** New `scripts/test_failure_parser.py` module (str-ops-only FAILED-line parser, no regex). New module-level functions in `tests/conftest.py` (lazy-import `win32gui`, `ApiHookClient`). Surgical edits to the post-refactor `scripts/run_tests_batched.py:_run_batch` to wire the parser into the SUMMARY. No new files in `src/`.
+
+**Tech Stack:** Python 3.11+ (stdlib `subprocess`, `os`, `sys`, `time`). `pywin32` (already a project dep; used lazily). `ApiHookClient` (existing).
+
+**Blocked by:** `test_batching_refactor_20260606` (must be SHIPPED — this plan reads from the new orchestrator's `_run_batch` and the legacy's `_extract_failed_files`).
+
+**Parent track:** None. **Child tracks:** None.
+
+---
+
+## Constraints (re-stated from the user's standing rules)
+
+- **Do NOT use the native `edit` tool on `.py` files.** It destroys 1-space indentation. Use `manual-slop_edit_file` (exact match), `manual-slop_set_file_slice` (single-line surgical only), or `manual-slop_py_update_definition` (function rewrites).
+- **Do NOT use `git restore`, `git checkout -- <file>`, or `git reset` without explicit user permission in the same message.** HARD BAN.
+- **Do NOT commit `config.toml`, `project.toml`, `project_history.toml`, or repo-root `manualslop_layout.ini`.** These are the user's. Stage and commit only the files listed in each task.
+- **Do NOT add `re` (regex) to the failure parser.** Use `str.startswith`, `str.find`, `str.split`, `str.replace`. Verify with `grep -n "import re\|from re" scripts/test_failure_parser.py` returning empty after Phase 1.
+- **1-space indentation for all Python code.** 2-space for class bodies. 0 leading spaces for module-level. CRLF line endings on Windows.
+- **Do NOT add inline comments to source code.** Docstrings are fine; `#` comments are not.
+- **Type hints required** for all new functions.
+
+---
+
+## Phase 1: Shared `_extract_failed_files` library
+
+Focus: Extract the FAILED-line parser to a shared module that both the legacy and new orchestrators can import. Str-ops-only contract, no regex, with comprehensive unit tests.
+
+**Files:**
+- Create: `scripts/test_failure_parser.py` (~35 lines)
+- Create: `tests/test_test_failure_parser.py` (~120 lines; 11 unit tests)
+- Modify: `scripts/run_tests_batched.py` (the post-refactor new orchestrator; if the legacy is still present and has a local copy, also update it)
+
+### Task 1.1: Red — add 11 unit tests for the shared parser
+
+**Files:** Create `tests/test_test_failure_parser.py`.
+
+- [ ] **Step 1: Write the failing test file**
+
+```python
+"""
+Unit tests for the FAILED-line parser in scripts/test_failure_parser.py.
+Shared by both the legacy run_tests_batched.py and the new orchestrator.
+Str-ops-only contract; no regex.
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "scripts"))
+
+import test_failure_parser as tfp
+
+
+def test_extract_empty():
+    assert tfp.extract_failed_files("") == []
+
+
+def test_extract_no_failed_lines():
+    out = "tests/test_foo.py .. [ 12%]\ntests/test_bar.py F [100%]\n===== 1 passed, 1 failed in 0.5s =====\n"
+    assert tfp.extract_failed_files(out) == []
+
+
+def test_extract_single_failed_line():
+    out = "FAILED tests/test_foo.py::test_bar - AssertionError: nope\n"
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_multiple_failed_lines_same_file():
+    out = (
+        "FAILED tests/test_foo.py::test_a - AssertionError\n"
+        "FAILED tests/test_foo.py::test_b - AssertionError\n"
+    )
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_multiple_failed_lines_different_files():
+    out = (
+        "FAILED tests/test_foo.py::test_a - AssertionError\n"
+        "FAILED tests/test_bar.py::test_b - AssertionError\n"
+    )
+    assert tfp.extract_failed_files(out) == ["test_foo.py", "test_bar.py"]
+
+
+def test_extract_failed_line_no_test_id():
+    out = "FAILED tests/test_foo.py - collection error\n"
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_failed_line_windows_path():
+    out = "FAILED tests\\test_foo.py::test_bar - AssertionError\n"
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_failed_line_class_method():
+    out = "FAILED tests/test_foo.py::TestClass::test_method - AssertionError\n"
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_failed_line_parametrized():
+    out = "FAILED tests/test_foo.py::test_bar[1] - AssertionError\n"
+    assert tfp.extract_failed_files(out) == ["test_foo.py"]
+
+
+def test_extract_ignores_lines_that_contain_failed_but_dont_start_with_it():
+    out = "===== 1 failed, 2 passed in 0.5s =====\n"
+    assert tfp.extract_failed_files(out) == []
+
+
+def test_extract_real_pytest_summary_block():
+    out = (
+        "===== short test summary info =====\n"
+        "FAILED tests/test_alpha.py::test_one - AssertionError: 1 != 2\n"
+        "FAILED tests/test_alpha.py::test_two - AssertionError: 3 != 4\n"
+        "FAILED tests/test_beta.py::TestThing::test_x - TypeError\n"
+        "===== 3 failed, 5 passed in 1.2s =====\n"
+    )
+    assert tfp.extract_failed_files(out) == ["test_alpha.py", "test_beta.py"]
+```
+
+- [ ] **Step 2: Run the test, verify it FAILS (no module yet)**
+
+Run: `uv run pytest tests/test_test_failure_parser.py -v`
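+Expected: collection ERROR — the module-level `import test_failure_parser` raises `ModuleNotFoundError: No module named 'test_failure_parser'`, so none of the 11 tests run (pytest reports 1 error during collection, not 11 failures).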
+
+- [ ] **Step 3: Commit the failing test (TDD red phase)**
+
+```powershell
+git add tests/test_test_failure_parser.py
+git commit -m "test(failure_parser): add 11 unit tests for shared FAILED-line parser"
+```
+
+### Task 1.2: Green — implement `extract_failed_files` in `scripts/test_failure_parser.py`
+
+**Files:** Create `scripts/test_failure_parser.py`.
+
+- [ ] **Step 1: Create the module**
+
+```python
+"""
+Shared FAILED-line parser for pytest output.
+
+Used by both scripts/run_tests_batched.py (the legacy and the new
+post-refactor orchestrator). Str-ops-only by design: no regex import
+per AGENTS.md standing ban across the codebase.
+
+Contract:
+  - Input: full captured stdout+stderr from a pytest invocation.
+  - Lines that begin with the literal 7-character prefix "FAILED "
+    (note the trailing space) are parsed for the test ID.
+  - The test ID portion ends at the first " - " (space-dash-space)
+    separator that introduces the error message.
+  - If the test ID contains "::", the file path is everything before
+    the first "::". Otherwise the test ID IS the file path.
+  - Backslashes are normalized to forward slashes (Windows safety).
+  - A leading "tests/" prefix is stripped so returned strings match
+    the bare filenames in the test file list.
+  - Returns the unique file paths in first-occurrence order.
+
+Lines that merely contain the substring "failed" (e.g. the
+"1 failed, 2 passed" summary footer) are NOT parsed.
+
+[C: scripts/run_tests_batched.py:_run_batch (post-refactor),
+ scripts/run_tests_batched.py:run_tests (legacy, if not yet
+ deleted by the refactor's Phase 4)]
+"""
+from __future__ import annotations
+
+_FAILED_PREFIX: str = "FAILED "
+
+
+def extract_failed_files(output: str) -> list[str]:
+ failed: list[str] = []
+ seen: set[str] = set()
+ for line in output.splitlines():
+  if not line.startswith(_FAILED_PREFIX):
+   continue
+  rest: str = line[len(_FAILED_PREFIX):]
+  dash_idx: int = rest.find(" - ")
+  test_id: str = rest if dash_idx == -1 else rest[:dash_idx]
+  colon_colon_idx: int = test_id.find("::")
+  filepath: str = test_id if colon_colon_idx == -1 else test_id[:colon_colon_idx]
+  filepath = filepath.replace("\\", "/")
+  if filepath.startswith("tests/"):
+   filepath = filepath[len("tests/"):]
+  if filepath and filepath not in seen:
+   seen.add(filepath)
+   failed.append(filepath)
+ return failed
+```
+
+- [ ] **Step 2: Run the test, verify it PASSES**
+
+Run: `uv run pytest tests/test_test_failure_parser.py -v`
+Expected: 11/11 PASS.
+
+- [ ] **Step 3: Verify no `re` import**
+
+Run: `grep -n "import re\|from re" scripts/test_failure_parser.py`
+Expected: no output (empty).
+
+- [ ] **Step 4: Commit the parser module**
+
+```powershell
+git add scripts/test_failure_parser.py
+git commit -m "feat(scripts): add shared test_failure_parser module (no regex)"
+```
+
+### Task 1.3: Wire the shared parser into the post-refactor orchestrator
+
+**Files:** Modify `scripts/run_tests_batched.py` (the new orchestrator from the refactor's Phase 3).
+
+This task assumes the refactor's Phase 3 is SHIPPED. The new orchestrator's `_run_batch` is shown in the refactor's plan.md at around lines 1295-1308:
+```python
+def _run_batch(b: Batch, durations: dict[str, float]) -> tuple[int, float, dict[str, float]]:
+ if b.skip_reason:
+  return 0, 0.0, {}
+ cmd = ["uv", "run", "pytest", "-v", "--durations=0"] + b.pytest_args + [str(f) for f in b.files]
+ print(f"\n>>> Running {b.label} ({len(b.files)} files)")
+ t0 = time.monotonic()
+ proc = subprocess.run(cmd, capture_output=True, text=True)
+ elapsed = time.monotonic() - t0
+ new_durs = _parse_durations_from_pytest_output(proc.stdout)
+ print(proc.stdout[-2000:] if proc.returncode != 0 else f"<<< {b.label} PASS in {elapsed:.1f}s")
+ if proc.returncode != 0:
+  print(f"<<< {b.label} FAIL (exit {proc.returncode}) in {elapsed:.1f}s")
+ print(proc.stderr[-1000:])
+ return proc.returncode, elapsed, new_durs
+```
+
+- [ ] **Step 1: Add the import at the top of the new orchestrator**
+
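+Read the current top of `scripts/run_tests_batched.py` (post-refactor) to identify the import block. Note: when the script is launched directly as `uv run python scripts/run_tests_batched.py` (as in Steps 5-6), Python puts `scripts/` — not the repo root — at the front of `sys.path`, so the `scripts.`-qualified import below only resolves if the orchestrator already adds the repo root to `sys.path`; if it does not, use `from test_failure_parser import extract_failed_files` instead. Add: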
+
+```python
+from scripts.test_failure_parser import extract_failed_files
+```
+
+- [ ] **Step 2: Refactor `_run_batch` to capture and surface per-file failure lists**
+
+Replace `_run_batch` with a version that:
+- Returns a `tuple[int, float, dict[str, float], list[tuple[str, str]]]` (4-tuple; the 4th element is the per-file failure list as `(file, note)` pairs, where `note` is `""` for a file parsed from a FAILED line)
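+- On `returncode != 0`, calls `extract_failed_files(proc.stdout + "\n" + proc.stderr)` to get the actual failed files; if that returns no files (non-zero exit but no FAILED lines), falls back to all files in the batch with a `(no FAILED lines; treating as batch failure)` annotation
+- On `subprocess.TimeoutExpired` (raised by `subprocess.run` when the batch exceeds the `timeout` argument, which `main()` fills from `--timeout`), falls back to all files in the batch with a `(timeout)` annotation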
+- Returns `[]` for skipped batches or successful runs
+
+```python
+def _run_batch(
+ b: Batch,
+ durations: dict[str, float],
+ timeout: int | None = None,
+) -> tuple[int, float, dict[str, float], list[tuple[str, str]]]:
+ if b.skip_reason:
+  return 0, 0.0, {}, []
+ cmd = ["uv", "run", "pytest", "-v", "--durations=0"] + b.pytest_args + [str(f) for f in b.files]
+ print(f"\n>>> Running {b.label} ({len(b.files)} files)")
+ t0 = time.monotonic()
+ failed: list[tuple[str, str]] = []
+ try:
+  proc = subprocess.run(
+   cmd,
+   capture_output=True,
+   text=True,
+   timeout=timeout,
+  )
+  elapsed = time.monotonic() - t0
+  new_durs = _parse_durations_from_pytest_output(proc.stdout)
+  if proc.returncode == 0:
+   print(f"<<< {b.label} PASS in {elapsed:.1f}s")
+  else:
+   actual: list[str] = extract_failed_files(proc.stdout + "\n" + proc.stderr)
+   if actual:
+    for f in actual:
+     failed.append((f, ""))
+    print(f"<<< {b.label} FAIL (exit {proc.returncode}) in {elapsed:.1f}s; {len(actual)} actually-failed file(s)")
+   else:
+    for f in b.files:
+     failed.append((str(f), "(no FAILED lines; treating as batch failure)"))
+    print(f"<<< {b.label} FAIL (exit {proc.returncode}) in {elapsed:.1f}s; no FAILED lines found, listing whole batch")
+  return proc.returncode, elapsed, new_durs, failed
+ except subprocess.TimeoutExpired:
+  elapsed = time.monotonic() - t0
+  for f in b.files:
+   failed.append((str(f), "(timeout)"))
+  print(f"<<< {b.label} TIMED OUT after {elapsed:.1f}s (limit {timeout}s)")
+  return 1, elapsed, {}, failed
+```
+
+- [ ] **Step 3: Update `_print_summary` to display the per-file failure list**
+
+The refactor's `_print_summary` takes `results: list[tuple[Batch, int, float]]` (3-tuple). Update to 4-tuple and add the per-file listing:
+
+```python
+def _print_summary(results: list[tuple[Batch, int, float, list[tuple[str, str]]]]) -> int:
+ print("\n" + "=" * 60)
+ print("SUMMARY")
+ print("=" * 60)
+ worst: int = 0
+ any_failed: bool = False
+ for b, code, elapsed, failed in results:
+  if b.skip_reason:
+   status: str = "SKIPPED"
+  elif code == 0:
+   status = "PASS"
+  else:
+   status = "FAIL"
+   any_failed = True
+  worst = max(worst, code)
+  n: int = len(b.files)
+  print(f"[{b.tier}] {b.label:40s} {status:8s} {n} files {elapsed:6.1f}s")
+  for f, note in failed:
+   suffix: str = f"  {note}" if note else ""
+   print(f"   - {f}{suffix}")
+ return 1 if any_failed else worst
+```
+
+- [ ] **Step 4: Update the `main()` callsite to thread the 4-tuple through**
+
+Find the loop in `main()` that calls `_run_batch` and accumulates results. Change the tuple unpacking from 3-tuple to 4-tuple and append the `failed` list to each `results` entry so that `_print_summary` receives it.
+
+Before:
+```python
+for b in batches:
+ code, elapsed, new_durs = _run_batch(b, merged_durations)
+ results.append((b, code, elapsed))
+```
+
+After:
+```python
+timeout_arg: int | None = options.timeout
+for b in batches:
+ code, elapsed, new_durs, failed = _run_batch(b, merged_durations, timeout=timeout_arg)
+ results.append((b, code, elapsed, failed))
+```
+
+Also add a `--timeout` argument to the `argparse.ArgumentParser` in `main()` (the refactor's spec doesn't have one; default 600s = 10 minutes per batch):
+
+```python
+p.add_argument("--timeout", type=int, default=600, help="seconds per batch (default: 600)")
+```
+
+- [ ] **Step 5: Verify the script still parses and the new tests pass**
+
+Run: `uv run pytest tests/test_test_failure_parser.py -v`
+Expected: 11/11 PASS.
+
+Run: `uv run python scripts/run_tests_batched.py --plan --tiers 1 2>&1 | head -20`
+Expected: prints tier-1 batches (no execution; just plan output).
+
+- [ ] **Step 6: Run a small tier-1 batch end-to-end to confirm the new path works**
+
+Run: `uv run python scripts/run_tests_batched.py --tiers 1 --no-xdist 2>&1 | tail -30`
+Expected: runs the unit tier; SUMMARY table printed; if any tests fail, the per-file failure list is shown under the failing tier.
+
+- [ ] **Step 7: Commit the integration**
+
+```powershell
+git add scripts/run_tests_batched.py
+git commit -m "feat(orchestrator): wire shared failure parser into _run_batch; per-file SUMMARY"
+```
+
+### Task 1.4: Conductor — User Manual Verification (Phase 1)
+
+- [ ] **Step 1: Run the unit tests**
+
+  Run: `uv run pytest tests/test_test_failure_parser.py -v`
+  Expected: 11/11 PASS.
+
+- [ ] **Step 2: Run a small tier with a deliberate failure to confirm end-to-end**
+
+  Create a temporary failing test:
+  ```python
+  # tests/test_zzz_fake_failure.py
+  def test_zzz_fake_failure():
+      assert False, "intentional failure"
+  ```
+
+  Run: `uv run python scripts/run_tests_batched.py --tiers 1 --no-xdist 2>&1 | tail -30`
+  Expected: SUMMARY shows the tier failed, the per-file listing shows `test_zzz_fake_failure.py`. Then delete the temp file.
+
+  If the run fails: capture the output to a log file and spawn a Tier 4 QA agent. Do not attempt more than 2 fix cycles; if still failing, report and stop.
+
+- [ ] **Step 3: PAUSE and present verification result**
+
+  > "Phase 1 verification: 11/11 unit tests pass; end-to-end run on tier 1 with a deliberate failure shows the file in the per-file listing. Ready to commit Phase 1 checkpoint and move to Phase 2? (yes / changes needed)"
+
+- [ ] **Step 4: Create the Phase 1 checkpoint**
+
+  Capture the most recent commit hash. Attach a git note. Update `plan.md` Phase 1 status to `[x]` and append the hash.
+
+  ```powershell
+  git notes add -m "Phase 1 of test_batching_post_refactor_polish_20260607: shared scripts/test_failure_parser.py with 11 unit tests; integrated into new orchestrator's _run_batch + SUMMARY. Per-file failure list now surfaced for non-zero exits; whole-batch fallback on timeout or no-FAILED-lines." <commit_sha>
+  ```
+
+---
+
+## Phase 2: `live_gui` Window Foregrounding
+
+Focus: Add `_foreground_subprocess_window` helper to `tests/conftest.py` and wire it into the `live_gui` fixture. Str-ops-only contract; no regex; lazy-import `win32gui`/`win32con`; never raises.
+
+**Files:**
+- Modify: `tests/conftest.py` (add helper + call from fixture)
+- Create: `tests/test_live_gui_foregrounding.py` (3 unit tests)
+
+### Task 2.1: Red — add unit tests for the foregrounding helper
+
+**Files:** Create `tests/test_live_gui_foregrounding.py`.
+
+- [ ] **Step 1: Write the failing test file**
+
+```python
+"""
+Unit tests for the sloppy.py window-foregrounding helper in
+tests/conftest.py. Platform-dispatched: Windows uses win32gui;
+non-Windows is a no-op. Tests must not require a real GUI subprocess.
+"""
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import conftest
+
+
+def test_foreground_helper_exists():
+ assert hasattr(conftest, "_foreground_subprocess_window")
+ assert callable(conftest._foreground_subprocess_window)
+
+
+def test_foreground_helper_noop_on_invalid_pid():
+ conftest._foreground_subprocess_window(pid=0)
+ conftest._foreground_subprocess_window(pid=0xFFFFFFFE)
+
+
+def test_foreground_helper_noop_when_win32gui_unavailable(monkeypatch):
+ real_import = __builtins__.__import__ if hasattr(__builtins__, "__import__") else __import__
+
+ def fake_import(name, *args, **kwargs):
+  if name in ("win32gui", "win32con"):
+   raise ImportError(f"simulated missing {name}")
+  return real_import(name, *args, **kwargs)
+
+ monkeypatch.setattr("builtins.__import__", fake_import)
+ conftest._foreground_subprocess_window(pid=0)
+```
+
+- [ ] **Step 2: Run the test, verify it FAILS**
+
+Run: `uv run pytest tests/test_live_gui_foregrounding.py -v`
+Expected: ALL 3 FAIL — `test_foreground_helper_exists` with `AssertionError` (the `hasattr` check), the other two with `AttributeError: module 'conftest' has no attribute '_foreground_subprocess_window'`.
+
+- [ ] **Step 3: Commit the failing test**
+
+```powershell
+git add tests/test_live_gui_foregrounding.py
+git commit -m "test(fixture): add unit tests for live_gui window-foregrounding helper"
+```
+
+### Task 2.2: Green — implement `_foreground_subprocess_window` in `tests/conftest.py`
+
+**Files:** Modify `tests/conftest.py` (add module-level function after imports, before any fixture).
+
+- [ ] **Step 1: Add the helper function**
+
+```python
+def _foreground_subprocess_window(pid: int, attempts: int = 3, delay_s: float = 0.5) -> None:
+ """
+ Best-effort: bring the given subprocess's main OS window to the
+ foreground. No-op on non-Windows, when pywin32 is unavailable,
+ or when the window cannot be found (the subprocess may not have
+ created its window yet).
+
+ Args:
+  pid: the OS process ID of the subprocess whose window to raise.
+  attempts: max number of lookup attempts.
+  delay_s: seconds to wait between attempts.
+
+ Behavior:
+  - Windows: uses win32gui.EnumWindows to find a top-level window
+   whose owning thread/process matches `pid`, then calls
+   ShowWindow(hwnd, SW_SHOWNORMAL) + SetForegroundWindow(hwnd).
+  - Non-Windows: returns immediately.
+  - Any exception: caught at the function boundary, logged via
+   print(), and the function returns. NEVER raises into the
+   test fixture (per the user's resilient-fixture preference).
+
+ [C: tests/conftest.py:live_gui fixture]
+ """
+ if os.name != "nt":
+  return
+ try:
+  import win32gui
+  import win32con
+ except ImportError:
+  return
+ for _ in range(attempts):
+  try:
+   hwnd_found: list[int] = []
+
+   def _cb(hwnd: int, ctx: list[int]) -> bool:
+    if win32gui.IsWindowVisible(hwnd):
+     _, found_pid = win32gui.GetWindowThreadProcessId(hwnd)
+     if found_pid == pid:
+      ctx.append(hwnd)
+      return False
+    return True
+
+   win32gui.EnumWindows(_cb, hwnd_found)
+   if hwnd_found:
+    hwnd: int = hwnd_found[0]
+    win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)
+    try:
+     win32gui.SetForegroundWindow(hwnd)
+    except Exception:
+     pass
+    return
+  except Exception as e:
+   print(f"[Fixture] WARNING: could not foreground sloppy.py window (pid={pid}): {e}")
+   return
+  time.sleep(delay_s)
+```
+
+- [ ] **Step 2: Run the test, verify it PASSES**
+
+Run: `uv run pytest tests/test_live_gui_foregrounding.py -v`
+Expected: 3/3 PASS.
+
+- [ ] **Step 3: Commit the helper**
+
+```powershell
+git add tests/conftest.py
+git commit -m "feat(fixture): add _foreground_subprocess_window helper for live_gui"
+```
+
+### Task 2.3: Wire the helper into the `live_gui` fixture
+
+**Files:** Modify `tests/conftest.py` (the `live_gui` fixture's `subprocess.Popen(...)` call site).
+
+- [ ] **Step 1: Locate the `subprocess.Popen(...)` call inside `live_gui`**
+
+Use `manual-slop_get_file_slice` or `manual-slop_py_get_definition` to find the exact line. The Popen call returns a `proc` object whose `.pid` attribute is what the helper needs.
+
+- [ ] **Step 2: Add the helper call immediately after the Popen returns**
+
+Insert one line right after the Popen block (after `proc` is assigned, before any subsequent `wait` / `health` check):
+
+```python
+_foreground_subprocess_window(proc.pid)
+```
+
+Anchor the edit on a unique surrounding context (e.g. the line right after Popen completes — typically a `print` line about spawning, or a `health check` call). Use `manual-slop_edit_file` with the exact `old_string`/`new_string`.
+
+- [ ] **Step 3: Verify the fixture still parses**
+
+Run: `uv run python -c "import ast; ast.parse(open('tests/conftest.py').read())"`
+Expected: no errors.
+
+- [ ] **Step 4: Run a single live_gui test to confirm the fixture still works**
+
+Run: `uv run pytest tests/test_hooks.py -v`
+Expected: passes. The `[Fixture]` log line may or may not appear depending on whether pywin32 is available and the subprocess window is findable; both are acceptable.
+
+- [ ] **Step 5: Commit the wiring**
+
+```powershell
+git add tests/conftest.py
+git commit -m "feat(fixture): foreground sloppy.py window in live_gui fixture"
+```
+
+### Task 2.4: Conductor — User Manual Verification (Phase 2)
+
+- [ ] **Step 1: Run the foregrounding unit tests**
+
+  Run: `uv run pytest tests/test_live_gui_foregrounding.py -v`
+  Expected: 3/3 PASS.
+
+- [ ] **Step 2: Run a small live_gui test to confirm the fixture still works**
+
+  Run: `uv run pytest tests/test_hooks.py -v`
+  Expected: passes.
+
+- [ ] **Step 3: PAUSE and present verification result**
+
+  > "Phase 2 verification: 3/3 unit tests pass; live_gui fixture still spawns successfully. Ready to commit Phase 2 checkpoint and move to Phase 3? (yes / changes needed)"
+
+- [ ] **Step 4: Create the Phase 2 checkpoint**
+
+  Capture the most recent commit hash. Attach a git note. Update `plan.md` Phase 2 status to `[x]` and append the hash.
+
+---
+
+## Phase 3: `focus_test_panel` Helper + Per-Test Wiring
+
+Focus: A new `focus_test_panel(name)` helper in `tests/conftest.py` using the existing `ApiHookClient.set_value`. Wire into 3 starter `*_sim.py` tests.
+
+**Files:**
+- Modify: `tests/conftest.py` (add `focus_test_panel` helper)
+- Modify: 3 `tests/test_*_sim.py` files (one-line addition each)
+
+### Task 3.1: Add the `focus_test_panel` helper
+
+**Files:** Modify `tests/conftest.py` (insert after `_foreground_subprocess_window`).
+
+- [ ] **Step 1: Add the helper function**
+
+```python
+def focus_test_panel(panel_name: str, host: str = "127.0.0.1", port: int = 8999) -> bool:
+ """
+ For live_gui tests: assert the named panel is visible so the user
+ watching the GUI subprocess can see the test's target panel.
+
+ Uses the existing ApiHookClient (no new IPC endpoints). The
+ set_value call toggles `show_windows["<name>"] = True` via the
+ Hook API.
+
+ Returns True on success, False if the hook server is not
+ reachable (e.g. called outside a live_gui session; the test
+ may choose to skip subsequent assertions on False).
+
+ [C: tests/test_*_sim.py — call before assertions]
+ """
+ try:
+  from src.api_hook_client import ApiHookClient
+ except ImportError:
+  return False
+ try:
+  client = ApiHookClient(host=host, port=port)
+  if not client.wait_for_server(timeout=0.5):
+   return False
+  client.set_value(f'show_windows["{panel_name}"]', True)
+  return True
+ except Exception as e:
+  print(f"[focus_test_panel] could not focus '{panel_name}': {e}")
+  return False
+```
+
+- [ ] **Step 2: Verify the helper imports cleanly**
+
+Run: `uv run python -c "import tests.conftest; print(hasattr(tests.conftest, 'focus_test_panel'))"`
+Expected: prints `True`.
+
+- [ ] **Step 3: Commit the helper**
+
+```powershell
+git add tests/conftest.py
+git commit -m "feat(fixture): add focus_test_panel helper for live_gui test panels"
+```
+
+### Task 3.2: Wire `focus_test_panel` into 3 starter sim tests
+
+**Files:** Modify 3 `tests/test_*_sim.py` files.
+
+- [ ] **Step 1: Add to `tests/test_command_palette_sim.py`**
+
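+  Find the test that uses the Command Palette (typically the only `def test_*(live_gui):` function). Because `focus_test_panel` is a plain module-level function and not a fixture, it is not injected automatically: import it at the top of the file (e.g. `from conftest import focus_test_panel`; the same applies to the other two sims below). Then add as the FIRST line after `client.wait_for_server(...)`: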
+
+  ```python
+  focus_test_panel("Command Palette")
+  ```
+
+- [ ] **Step 2: Add to `tests/test_workflow_sim.py`**
+
+  Find the test that drives the Discussion Hub. Add:
+
+  ```python
+  focus_test_panel("Discussion Hub")
+  ```
+
+- [ ] **Step 3: Add to `tests/test_undo_redo_sim.py`**
+
+  Find the test that exercises Undo/Redo. Add:
+
+  ```python
+  focus_test_panel("Discussion Hub")
+  ```
+
+- [ ] **Step 4: Verify each file parses**
+
+For each:
+```powershell
+uv run python -c "import ast; ast.parse(open('tests/test_command_palette_sim.py').read())"
+uv run python -c "import ast; ast.parse(open('tests/test_workflow_sim.py').read())"
+uv run python -c "import ast; ast.parse(open('tests/test_undo_redo_sim.py').read())"
+```
+Expected: no errors.
+
+- [ ] **Step 5: Run one of the modified sims to confirm the fixture still works**
+
+Run: `uv run pytest tests/test_command_palette_sim.py -v`
+Expected: passes. The new `focus_test_panel("Command Palette")` call is idempotent for an already-visible panel.
+
+- [ ] **Step 6: Commit the wiring**
+
+```powershell
+git add tests/test_command_palette_sim.py tests/test_workflow_sim.py tests/test_undo_redo_sim.py
+git commit -m "test(sim): add focus_test_panel calls to 3 starter live_gui sims"
+```
+
+### Task 3.3: Conductor — User Manual Verification (Phase 3)
+
+- [ ] **Step 1: Run the 3 modified sim tests**
+
+  Run: `uv run pytest tests/test_command_palette_sim.py tests/test_workflow_sim.py tests/test_undo_redo_sim.py -v`
+  Expected: all pass.
+
+- [ ] **Step 2: PAUSE and present verification result**
+
+  > "Phase 3 verification: 3 sim tests pass with focus_test_panel calls. The helper is exported and idempotent. Ready to commit Phase 3 checkpoint and move to Phase 4? (yes / changes needed)"
+
+- [ ] **Step 3: Create the Phase 3 checkpoint**
+
+  Capture the most recent commit hash. Attach a git note. Update `plan.md` Phase 3 status to `[x]` and append the hash.
+
+---
+
+## Phase 4: `tests/artifacts/` Scratch Cleanup
+
+Focus: Verify the candidate scratch files have NO references in the codebase, then delete them. Single atomic commit.
+
+**Files:** Delete only; no modifications.
+
+### Task 4.1: Verify and delete scratch files
+
+- [ ] **Step 1: Build the candidate list and verify each is unreferenced**
+
+  The candidate list (per spec §4.4 FR-19):
+  - `test_parser.py`, `test_patterns.py`, `test_regex.py`
+  - `verify_layout.py`, `check_cwd.py`, `check_cwd_uv.py`, `exists.py`, `fix_stale_names.py`, `fix_conftest_layout.py`
+  - `fake_test_output.txt`
+  - `agents_skip_msg.txt`, `commit_layout_diag_msg.txt`, `configpath_msg.txt`, `context_presets_msg.txt`, `hooks_dictkey_msg.txt`, `reset_layout_msg.txt`, `st2a_prompt.txt`, `st2a_task.toml`, `st2g_msg.txt`, `st2g_msg2.txt`, `st2g_msg3.txt`, `stale_test_msg.txt`, `synthesis_crash_msg.txt`, `warmup_fix_msg.txt`, `workflow_skip_msg.txt`
+  - `task1.toml`, `task1.txt`, `task2.toml`, `task2_1.txt`, `task3.toml`, `task3_1.txt`, `task4.toml`, `task_1_1.txt`
+  - `temp_config.toml`, `temp_data.txt`, `temp_liveaisettingssim.toml`, `temp_livecontextsim.toml`, `temp_liveexecutionsim.toml`, `temp_livetoolssim.toml`, `temp_notes.txt`, `temp_project.toml`, `temp_settings.toml`, `temp_simproject.toml`
+  - `test_001.md`
+
+  For each candidate, run a grep across `tests/`, `scripts/`, `src/`, `docs/`:
+  ```powershell
+  rg "<filename>" tests/ scripts/ src/ docs/
+  ```
+  Expected: zero matches. If any match is found, PRESERVE that file (do NOT delete) and note in the commit message.
+
+  Also confirm each file is gitignored (or untracked):
+  ```powershell
+  git check-ignore -v tests/artifacts/test_parser.py
+  ```
+  Expected: prints a `.gitignore` rule for each. If any file is TRACKED, do NOT delete it without explicit user permission (HARD BAN on `git restore`/`git checkout --`).
+
+- [ ] **Step 2: Delete the verified files**
+
+  Use a single PowerShell command:
+  ```powershell
+  Remove-Item tests/artifacts/test_parser.py, tests/artifacts/test_patterns.py, tests/artifacts/test_regex.py, tests/artifacts/verify_layout.py, tests/artifacts/fake_test_output.txt, tests/artifacts/check_cwd.py, tests/artifacts/check_cwd_uv.py, tests/artifacts/exists.py, tests/artifacts/fix_stale_names.py, tests/artifacts/fix_conftest_layout.py, tests/artifacts/agents_skip_msg.txt, tests/artifacts/commit_layout_diag_msg.txt, tests/artifacts/configpath_msg.txt, tests/artifacts/context_presets_msg.txt, tests/artifacts/hooks_dictkey_msg.txt, tests/artifacts/reset_layout_msg.txt, tests/artifacts/st2a_prompt.txt, tests/artifacts/st2a_task.toml, tests/artifacts/st2g_msg.txt, tests/artifacts/st2g_msg2.txt, tests/artifacts/st2g_msg3.txt, tests/artifacts/stale_test_msg.txt, tests/artifacts/synthesis_crash_msg.txt, tests/artifacts/task1.toml, tests/artifacts/task1.txt, tests/artifacts/task2.toml, tests/artifacts/task2_1.txt, tests/artifacts/task3.toml, tests/artifacts/task3_1.txt, tests/artifacts/task4.toml, tests/artifacts/temp_config.toml, tests/artifacts/temp_data.txt, tests/artifacts/temp_liveaisettingssim.toml, tests/artifacts/temp_livecontextsim.toml, tests/artifacts/temp_liveexecutionsim.toml, tests/artifacts/temp_livetoolssim.toml, tests/artifacts/temp_notes.txt, tests/artifacts/temp_project.toml, tests/artifacts/temp_settings.toml, tests/artifacts/temp_simproject.toml, tests/artifacts/test_001.md, tests/artifacts/warmup_fix_msg.txt, tests/artifacts/workflow_skip_msg.txt, tests/artifacts/task_1_1.txt
+  ```
+
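+  If a listed file doesn't exist (already deleted or never existed), `Remove-Item` reports a non-terminating error for that path and still removes the remaining files — that's fine. Append `-ErrorAction SilentlyContinue` to the command to suppress those messages.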
+
+- [ ] **Step 3: Verify the directory still has the preserved files**
+
+  ```powershell
+  Get-ChildItem tests/artifacts
+  ```
+  Expected: only the preserved entries (`.gitignore`, `manualslop_layout_default.ini`, runtime state directories, referenced TOML files). No scratch files.
+
+- [ ] **Step 4: Commit the cleanup**
+
+  ```powershell
+  git add -A tests/artifacts
+  git status   # confirm no tracked files inside tests/artifacts were deleted
+  git commit -m "chore(artifacts): remove ~45 scratch files from tests/artifacts/"
+  ```
+
+  If nothing is staged because every deleted file was gitignored, `git commit` exits with "nothing to commit" and creates no commit. That's acceptable — the deletion only affects the working tree, not the git history (Task 4.2 Step 2 covers the no-SHA case).
+
+### Task 4.2: Conductor — User Manual Verification (Phase 4)
+
+- [ ] **Step 1: PAUSE and present the cleanup result**
+
+  > "Phase 4 complete. tests/artifacts/ now contains only the preserved files. Listing: <list>. Ready to commit Phase 4 checkpoint and finalize? (yes / changes needed)"
+
+- [ ] **Step 2: Create the Phase 4 checkpoint**
+
+  Capture the most recent commit hash (or note that the commit was empty). Attach a git note. Update `plan.md` Phase 4 status to `[x]` and append the hash (or "no SHA; gitignored delete" if no commit SHA).
+
+---
+
+## Phase 5: Track Finalization (Verification + Status Update)
+
+Focus: Re-run the full test suite (5 batches, 298 files) to confirm no regressions. Update `conductor/tracks.md`. Commit the plan update.
+
+### Task 5.1: Full suite regression run
+
+- [ ] **Step 1: Run the full test suite via the new orchestrator (or legacy, whichever is current default)**
+
+  If the refactor's Phase 3 is shipped, run:
+  ```powershell
+  uv run python scripts/run_tests_batched.py --tiers 1,2,3
+  ```
+  Otherwise, run the legacy:
+  ```powershell
+  uv run python scripts/run_tests_batched.py --batch-size 64
+  ```
+
+  Expected: all batches 1-4 pass; batch 5 (or tier 3 for the new orchestrator) may have failures. The per-file failure list now shows the actual files.
+
+- [ ] **Step 2: PAUSE and present the regression result**
+
+  > "Phase 5 verification: full suite run; per-file failure list verified. No regressions in batches 1-4. The track's verification criteria are all met. Ready to mark the track complete? (yes / changes needed)"
+
+### Task 5.2: Update `conductor/tracks.md`
+
+- [ ] **Step 1: Add a "Phase 9" chore-track entry for this track**
+
+  Format (mirroring existing entries):
+
+  ```markdown
+  - [x] **Track: Test Batching — Post-Refactor Polish** `[checkpoint: <sha>]`
+     *Link: [./tracks/test_batching_post_refactor_polish_20260607/](./tracks/test_batching_post_refactor_polish_20260607/), Spec: [./tracks/test_batching_post_refactor_polish_20260607/spec.md](./tracks/test_batching_post_refactor_polish_20260607/spec.md), Plan: [./tracks/test_batching_post_refactor_polish_20260607/plan.md](./tracks/test_batching_post_refactor_polish_20260607/plan.md)*
+     *Goal: After test_batching_refactor_20260606 ships, lift _extract_failed_files to scripts/test_failure_parser.py (shared by legacy and new orchestrator); wire per-file failure list into the new orchestrator's SUMMARY; add _foreground_subprocess_window + focus_test_panel helpers to live_gui fixture; clean up ~45 scratch files in tests/artifacts/. No new dependencies; no regex.*
+  ```
+
+- [ ] **Step 2: Commit the tracks.md update**
+
+  ```powershell
+  git add conductor/tracks.md
+  git commit -m "conductor(tracks): mark test_batching_post_refactor_polish_20260607 as complete"
+  ```
+
+### Task 5.3: Final archive (optional)
+
+- [ ] **Step 1: Ask the user whether to archive**
+
+  > "Track complete. Archive to `conductor/tracks/archive/` now, or leave in `tracks/`? (archive / leave)"
+
+- [ ] **Step 2: If archive chosen**
+
+  ```powershell
+  git mv conductor/tracks/test_batching_post_refactor_polish_20260607 conductor/tracks/archive/
+  git commit -m "conductor(archive): archive test_batching_post_refactor_polish_20260607"
+  ```
+
+- [ ] **Step 3: Announce completion**
+
+  > "Track `test_batching_post_refactor_polish_20260607` is complete. The refactor is now followed by observability + parser polish."
@@ -0,0 +1,235 @@
+# Track Specification: Test Batching — Post-Refactor Polish
+
+**Status:** Active (spec authored 2026-06-08)
+**Initialized:** 2026-06-08
+**Owner:** Tier 2 Tech Lead
+**Priority:** Medium (developer ergonomics + observability; not a regression blocker)
+**Blocked by:** `test_batching_refactor_20260606` (must be SHIPPED before this track begins; the new orchestrator from the refactor is the target of the polish)
+**Blocks:** None
+
+---
+
+## 1. Problem Statement
+
+`test_batching_refactor_20260606` will replace the current `scripts/run_tests_batched.py` with a tier-based orchestrator that:
+- Uses `subprocess.run(cmd, capture_output=True, text=True)` to invoke each batch's pytest
+- On failure, prints the last 2000 chars of stdout (the new spec/plan, Phase 3 Task 3.1, line 1304: `print(proc.stdout[-2000:] if proc.returncode != 0 else ...)`)
+- Has no mechanism to surface the **actual failed file paths** to the user
+
+This is a regression in failure visibility vs. the current script (which lists every file in a failed batch — bad, but at least explicit). The new script will print a tail of pytest output that the user must manually scan for `FAILED ` lines.
+
+Three concrete improvements are deferred from the refactor to this track:
+
+1. **Per-file FAILED-line extraction** in the new orchestrator. When a tier batch fails, the script's summary should list the specific test files pytest reported as failed (parsed via str ops only, no regex per `AGENTS.md` standing ban). Same contract the current legacy script's `_extract_failed_files` (when fixed) will provide.
+2. **`live_gui` subprocess window foregrounding.** When the `live_gui` fixture spawns `sloppy.py`, the OS window must be raised to the foreground so the user watching the test can see the activity. Tier 3 (consolidated `live_gui`, 14+ `*_sim.py` files in one pytest invocation) amplifies this: without foregrounding, the user sees a hidden window for 30-60s while the tier runs.
+3. **`focus_test_panel(name)` test helper.** Live_gui tests should signal which panel they're exercising. The helper uses the existing `ApiHookClient.set_value` to toggle `show_windows[name] = True` and is called from individual `*_sim.py` test setup. The refactor's Tier 3 consolidation makes this signal-critical: the user needs to see WHICH panel is being driven, not just that something is happening.
+
+A fourth improvement is housekeeping: removing ~45 scratch files left in `tests/artifacts/` by prior sessions (regex experimentation, layout-baking debugging, sub-track task notes). These are gitignored but clutter the directory. Safe deletion is non-trivial (some files may be referenced by other tests or fixtures), so it's deferred to this track, where it can be done carefully with verification.
+
+---
+
+## 2. Current State Audit (as of `2db14361 TEST LAYOUT`)
+
+### Already Implemented (DO NOT re-implement)
+
+| What | Where | Status |
+|---|---|---|
+| `App._diag_layout_state()` method | `src/gui_2.py:507-544` | Committed `818537b3`. Logs `[GUI] show_windows entries: N`, `[GUI] layout file: <path> (<bytes>)`, `[GUI] WARNING: layout has N stale window name(s)...` |
+| `manualslop_layout_default.ini` (user's preferred 2-column layout) | `tests/artifacts/manualslop_layout_default.ini` (2,699 bytes) | Whitelisted in `.gitignore` line 17. Confirmed loaded by `_diag_layout_state` log. |
+| `tests/conftest.py:418-421` copies the layout artifact into the test workspace | `tests/conftest.py:418-421` | Replaces the prior "do NOT copy" block from `7a4f71e7` |
+| `_default_windows` updated for 12-window visible-by-default set | `src/app_controller.py:1832-1855` | MMA Dashboard=False, Log Management=True, Diagnostics=True |
+| `_STALE_WINDOW_NAMES` set | `src/gui_2.py:530-533` | 10 names (Theme removed; was incorrectly flagged as stale) |
+| Skip markers from `e09e6823` resolved | `8d58d7fc` (warmup races), `a36aad50` (gui_events_v2), `91b34ae8` (live_gui_filedialog), `ff523f7e` (project_switch_persona) | 3 of 5 fixed in subsequent commits; 2 in `8d58d7fc` |
+| `RUN_MMA_INTEGRATION` env-var gate on `test_mma_step_mode_sim.py` | `tests/test_mma_step_mode_sim.py:24-27` | Appropriate opt-in integration gate, not a broken test |
+| `scripts/cleanup_orphaned_processes.py` | Committed `5e1867bb` | Manages stale subprocesses; preserves MCP servers |
+| `_extract_failed_files` (in legacy `run_tests_batched.py`, if Phase 0 ships) | `scripts/run_tests_batched.py:30-50` (post-Phase-0) | Str-ops-only FAILED-line parser; 11 unit tests in `tests/test_run_tests_batched.py` |
+
+### Gaps to Fill (This Track's Scope)
+
+| Gap | Severity | Where the fix lands |
+|---|---|---|
+| New orchestrator's `subprocess.run(capture_output=True)` only prints stdout tail on failure — no per-file failure list | **High** | New `scripts/run_tests_batched.py` (post-refactor) — the `_run_batch` helper around line 1296-1308 of the refactor's plan |
+| `live_gui` fixture doesn't bring sloppy.py's window to front | **Medium** | `tests/conftest.py:live_gui` fixture |
+| `live_gui` tests have no per-test focus signal | **Medium** | `tests/conftest.py` (new helper) + per-test callsites in 14+ `*_sim.py` files |
+| `tests/artifacts/` has ~45 scratch files from prior sessions | **Low** | `tests/artifacts/*.py`, `tests/artifacts/*.txt`, `tests/artifacts/*.toml` (verify references first) |
+| The `_extract_failed_files` from Phase 0 of the refactor (if shipped) lives in the LEGACY script that gets renamed to `.legacy` in Phase 3, then deleted in Phase 4 | **Critical** | The function needs to be lifted to a shared location (e.g., `scripts/test_failure_parser.py`) so both legacy and new orchestrator use the same code |
+
+---
+
+## 3. Goals
+
+1. **Per-file FAILED-line extraction in the new orchestrator.** When any tier batch fails, the summary lists the specific test files pytest reported as failed (via str ops only, no regex). On timeout, fall back to listing the whole batch with `(timeout)` annotation.
+2. **Lift `_extract_failed_files` to a shared library.** The function lives in `scripts/test_failure_parser.py` (or similar); both the legacy script and the new orchestrator import it. No code duplication.
+3. **`live_gui` subprocess window foregrounding.** When the fixture spawns `sloppy.py`, find the child window by PID and call `ShowWindow` + `SetForegroundWindow`. No-op on non-Windows or when pywin32 is unavailable. Wrapped in `try/except`; never raises.
+4. **`focus_test_panel(name)` helper.** New module-level function in `tests/conftest.py` that uses the existing `ApiHookClient.set_value` to toggle `show_windows[name] = True`. Returns True/False (False if hook server unreachable).
+5. **Wire `focus_test_panel` into at least 3 starter `*_sim.py` tests** so the pattern is established for the refactor's consolidated Tier 3.
+6. **Clean up `tests/artifacts/` scratch files** (with verification of non-reference first).
+
+---
+
+## 4. Functional Requirements
+
+### 4.1 Shared `_extract_failed_files` library
+
+**FR-1.** Create `scripts/test_failure_parser.py` containing the `_extract_failed_files(output: str) -> list[str]` function. Str-ops-only (no `re` import per `AGENTS.md`).
+
+**FR-2.** The function SHALL:
+- Accept the full captured stdout+stderr from a pytest invocation
+- Parse lines beginning with the literal 7-character prefix `FAILED ` (note trailing space)
+- Extract the test ID, ending at the first ` - ` (space-dash-space) separator
+- If the test ID contains `::`, take the file path portion (before the first `::`)
+- Normalize backslashes to forward slashes (Windows path safety)
+- Strip a leading `tests/` prefix to return the bare filename
+- Deduplicate (preserve first-occurrence order)
+
+**FR-3.** Update the legacy `scripts/run_tests_batched.py` to import `_extract_failed_files` from the new shared module (if it was implemented locally in the refactor's Phase 0; otherwise add it there for the first time).
+
+**FR-4.** Update the new orchestrator (post-refactor) to call `_extract_failed_files` on the captured stdout/stderr in `_run_batch` when `returncode != 0`. Use the returned list to populate the SUMMARY table's per-file failure list.
+
+**FR-5.** Add 11+ unit tests in `tests/test_test_failure_parser.py` covering the contract from FR-2 (same set as the original 11 tests for the legacy script, ported to the new module).
+
+### 4.2 New Orchestrator Per-File Failure List
+
+**FR-6.** In the new `scripts/run_tests_batched.py:_run_batch` (post-refactor), on non-zero exit:
+- Call `_extract_failed_files(proc.stdout + proc.stderr)` (combined)
+- If the returned list is non-empty, add those files to the per-tier failure list
+- If the returned list is empty (rare; collection errors, plugin crashes), add the whole batch's files with a `(no FAILED lines; treating as batch failure)` annotation
+
+**FR-7.** On `subprocess.TimeoutExpired` (the batch exceeded `--timeout`): fall back to `failed_files.extend(batch)` with `(timeout)` annotation (per-file accuracy impossible on timeout — same as legacy).
+
+**FR-8.** The SUMMARY table (new orchestrator's `_print_summary`) SHALL include a per-file failure listing when any tier failed:
+```
+[TIER 3] live_gui              FAIL   14/14  47.2s
+   - tests/test_foo.py
+   - tests/test_bar.py
+```
+
+**FR-9.** The orchestrator's worst-case exit code SHALL be 1 if any tier has a per-file failure list, 0 if all tiers passed or were skipped.
+
+### 4.3 Live_Gui Window Foregrounding (`tests/conftest.py`)
+
+**FR-10.** Add module-level function `_foreground_subprocess_window(pid: int, attempts: int = 3, delay_s: float = 0.5) -> None` to `tests/conftest.py`.
+
+**FR-11.** The function SHALL:
+- No-op immediately on `os.name != "nt"`
+- Try-except `import win32gui, win32con, win32process`; no-op on `ImportError`
+- Loop `attempts` times: `win32gui.EnumWindows` to find a top-level visible window whose owning PID (obtained via `win32process.GetWindowThreadProcessId(hwnd)`) matches `pid`; on match, call `win32gui.ShowWindow(hwnd, win32con.SW_SHOWNORMAL)` then `win32gui.SetForegroundWindow(hwnd)`
+- Sleep `delay_s` between attempts (the subprocess may take 1-2s to create its window)
+- Wrap the whole body in `try/except Exception`; log a `[Fixture] WARNING: ...` line and return on any error; NEVER raise into the test fixture
+
+**FR-12.** Wire the helper into the `live_gui` fixture: insert one line `_foreground_subprocess_window(proc.pid)` immediately after the `subprocess.Popen(...)` call returns.
+
+**FR-13.** Add 3 unit tests in `tests/test_live_gui_foregrounding.py` asserting: helper exists and is callable; helper is no-op on invalid PIDs; helper is no-op when `win32gui`/`win32con` import fails (monkeypatched).
+
+### 4.4 `focus_test_panel` Helper
+
+**FR-14.** Add module-level function `focus_test_panel(panel_name: str, host: str = "127.0.0.1", port: int = 8999) -> bool` to `tests/conftest.py`.
+
+**FR-15.** The function SHALL:
+- Try-except `from src.api_hook_client import ApiHookClient`; return False on `ImportError`
+- Instantiate `ApiHookClient(host=host, port=port)`
+- Call `client.wait_for_server(timeout=0.5)`; return False if the server is not reachable
+- Call `client.set_value(f'show_windows["{panel_name}"]', True)`
+- Wrap the whole body in `try/except Exception`; log a `[focus_test_panel] ...` line and return False on any error
+- Return True on success
+
+**FR-16.** The function is OPTIONAL for tests: tests that don't call it get existing behavior. Tests that call it signal intent. The function's return value is informational (caller may choose to skip on False).
+
+**FR-17.** Wire `focus_test_panel` into at least 3 starter `*_sim.py` files (one-line addition in test setup, immediately after `client.wait_for_server(...)`):
+- `tests/test_command_palette_sim.py`: `focus_test_panel("Command Palette")`
+- `tests/test_workflow_sim.py`: `focus_test_panel("Discussion Hub")`
+- `tests/test_undo_redo_sim.py`: `focus_test_panel("Discussion Hub")`
+
+### 4.5 `tests/artifacts/` Scratch Cleanup
+
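+**FR-18.** Verify each candidate scratch file is NOT referenced by any test or fixture: search for the full filename, including its extension, as a literal string (`rg -F "<filename>" tests/ scripts/ src/ docs/`) and confirm zero matches. Search for the full name rather than the extension-less stem, because a stem such as `exists` would also match unrelated code (e.g. `os.path.exists`).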
+
+**FR-19.** For files with zero references, delete them. The candidate list (from prior session's report + my own audit of `tests/artifacts/`):
+- `test_parser.py`, `test_patterns.py`, `test_regex.py` (regex experimentation)
+- `verify_layout.py`, `check_cwd.py`, `check_cwd_uv.py`, `exists.py`, `fix_stale_names.py`, `fix_conftest_layout.py` (layout + cwd debugging)
+- `fake_test_output.txt` (sample data for parser testing)
+- `agents_skip_msg.txt`, `commit_layout_diag_msg.txt`, `configpath_msg.txt`, `context_presets_msg.txt`, `hooks_dictkey_msg.txt`, `reset_layout_msg.txt`, `st2a_prompt.txt`, `st2a_task.toml`, `st2g_msg.txt`, `st2g_msg2.txt`, `st2g_msg3.txt`, `stale_test_msg.txt`, `synthesis_crash_msg.txt`, `warmup_fix_msg.txt`, `workflow_skip_msg.txt` (agent scratch messages)
+- `task1.toml`, `task2.toml`, `task3.toml`, `task4.toml`, `task1.txt`, `task2_1.txt`, `task3_1.txt`, `task_1_1.txt` (task notes)
+- `temp_config.toml`, `temp_data.txt`, `temp_live*.toml`, `temp_notes.txt`, `temp_project.toml`, `temp_settings.toml`, `temp_simproject.toml` (temp scratch)
+- `test_001.md` (25KB scratch markdown)
+
+**FR-20.** The following SHALL be PRESERVED:
+- `tests/artifacts/manualslop_layout_default.ini` (whitelisted in `.gitignore`)
+- `tests/artifacts/manual_slop.toml`, `repro_project.toml`, `test_snapshot_project.toml` (referenced by fixtures)
+- `tests/artifacts/live_gui_workspace/`, `repro_workspace/`, `temp_workspace/`, `gui_ux_sim/`, `test_isolated_project/`, `test_link_workspace/`, `conductor/`, `.slop_cache/` (runtime state)
+- `tests/artifacts/.gitignore` (in-place gitignore for the subdirectory)
+
+---
+
+## 5. Non-Functional Requirements
+
+**NFR-1.** 1-space indentation throughout all Python changes (per `conductor/product-guidelines.md`).
+**NFR-2.** CRLF line endings on Windows for all changed `.py` files.
+**NFR-3.** No inline comments in production code (per `AGENTS.md`).
+**NFR-4.** No `re` (regex) module imports in the failure parser. Verify with `grep -n "import re\|from re" scripts/test_failure_parser.py` returning empty after the change.
+**NFR-5.** No new external dependencies. No `pyproject.toml` change.
+**NFR-6.** Type hints required for all new functions and the modified `_run_batch` signature in the new orchestrator.
+**NFR-7.** The window-foregrounding helper SHALL NOT call `SetForegroundWindow` more than 3 times per session (Windows throttles repeated foreground-stealing attempts).
+**NFR-8.** All commits are atomic per-task (per `conductor/workflow.md` "Definition of Done").
+
+---
+
+## 6. Architecture Reference
+
+- **`docs/guide_architecture.md` "Thread domains"** — the live_gui fixture runs in the pytest process (foreground); sloppy.py runs in a subprocess. The fixture → subprocess communication is over the Hook API (`127.0.0.1:8999`). Window-foregrounding uses a separate channel (Windows OS API; `win32gui`).
+- **`docs/guide_testing.md` "live_gui fixture"** — the session-scoped fixture's lifecycle.
+- **`docs/guide_api_hooks.md` "ApiHookClient.set_value"** — the existing mechanism for toggling `show_windows[name]`. The new `focus_test_panel` helper uses this.
+- **`docs/guide_simulations.md` "Puppeteer pattern"** — existing pattern for live_gui tests; the new `focus_test_panel` is a small variant of the same shape.
+- **`conductor/tracks/test_batching_refactor_20260606/spec.md` §3.3 "Six Tiers"** — Tier 3 (live_gui) is the upstream system this track polishes. The new orchestrator's `_run_batch` is the integration point for the per-file failure list.
+- **`conductor/tracks/startup_speedup_20260606/state.toml` §`conftest_warmup_wait`** — the fixture's existing warmup-blocking wait runs at conftest load time, before the live_gui fixture executes. The new window-foregrounding code runs AFTER the subprocess spawns (not at load time) and is therefore orthogonal.
+- **`AGENTS.md` "Critical Anti-Patterns"** — re-affirms the standing ban on `re` (regex) module imports in the codebase. The user has threatened a 10-page report if they see regex.
+
+---
+
+## 7. Coordination with `test_batching_refactor_20260606`
+
+| Refactor phase | What this track does after it ships |
+|---|---|
+| **Phase 1** (Library + dry-run) | Nothing; legacy script unchanged. |
+| **Phase 2** (Shadow run) | Nothing; shadow run still uses legacy + new in parallel. |
+| **Phase 3** (Switch default, rename legacy to `.legacy`) | The legacy's `_extract_failed_files` (if implemented in refactor's Phase 0) is moved to `scripts/test_failure_parser.py` so the new orchestrator can use it without forking. The new orchestrator's `_run_batch` is updated to call the shared parser. |
+| **Phase 4** (Cleanup, delete legacy) | The legacy is deleted; `scripts/test_failure_parser.py` is the sole home of the FAILED-line parser. |
+
+### 7.1 Open question for the refactor (recorded, not fixed here)
+
+The refactor's `scripts/test_categorizer.py::auto_classify()` rule #2 uses **regex** in the spec (`AGENTS.md` ban conflict):
+> `\(live_gui\)\s*[:,)]` regex match in source
+
+The user has confirmed they will instruct the implementing agent to convert this to AST-based detection (`ast.parse` → walk `FunctionDef` for `live_gui` in args). This is **the refactor's responsibility**, not this post-refactor track's.
+
+---
+
+## 8. Out of Scope
+
+- **The test batching refactor itself** — owned by `test_batching_refactor_20260606`.
+- **Auto-classification regex → AST conversion** — the user will instruct the agent directly; not part of this track.
+- **Tracked `manualslop_layout.ini` at repo root** — requires explicit user permission per the user's HARD BAN on `git restore`/`git checkout --`. The conftest no longer copies it to the test workspace (regression fixed in `7a4f71e7`).
+- **User's TOML files** (`config.toml`, `project.toml`, `project_history.toml`) — explicitly excluded per the user's standing constraint.
+- **New audit scripts** — none introduced. The existing audit set is sufficient.
+- **The skip markers from `e09e6823`** — 3 fixed in subsequent commits, 2 in `8d58d7fc`. No skip markers remain that this track needs to address.
+- **The `__getattr__` cheat audit work** — separate track referenced in `conductor/reports/AUDIT_ARCHITECTURAL_CHEATS_20260607.md`.
+- **Performance baseline** — the refactor's `--durations` feature records runtimes. Generating that file is a Phase 1 task of the refactor, not this track.
+
+---
+
+## 9. Verification Criteria
+
+This track is "done" when **all** of the following are true:
+
+- [ ] `scripts/test_failure_parser.py` exists and exports `_extract_failed_files` (no `re` import; verify with `grep -n "import re\|from re" scripts/test_failure_parser.py` returning empty).
+- [ ] 11+ unit tests in `tests/test_test_failure_parser.py` all pass.
+- [ ] The legacy `scripts/run_tests_batched.py` (if not yet deleted by the refactor) imports `_extract_failed_files` from the new module.
+- [ ] The new `scripts/run_tests_batched.py` (post-refactor) `_run_batch` calls `_extract_failed_files` on captured output and includes the per-file failure list in the SUMMARY table.
+- [ ] `tests/conftest.py:_foreground_subprocess_window` exists; 3 unit tests pass; the live_gui fixture calls it after `subprocess.Popen(...)`.
+- [ ] `tests/conftest.py:focus_test_panel` exists; 3+ `*_sim.py` tests call it in setup.
+- [ ] The scratch files from FR-19 are deleted; the directory only contains the preserved files/directories from FR-20.
+- [ ] The existing test suite still passes for batches 1-4 (no regressions).
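+- [ ] Batch 5's timeout (test_z_negative_flows) is reported as exactly 1 failed file, not all 42, provided pytest emits a `FAILED ` line for it; a batch-level `subprocess.TimeoutExpired` still lists the whole batch with a `(timeout)` annotation per FR-7.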
+- [ ] All commits are atomic per-task with descriptive messages.
+- [ ] No commits include the user's TOML files.
+- [ ] No commits include `manualslop_layout.ini` at the repo root.
@@ -0,0 +1,84 @@
+# Track state for test_batching_post_refactor_polish_20260607
+# Updated by Tier 2 Tech Lead as tasks complete
+
+[meta]
+track_id = "test_batching_post_refactor_polish_20260607"
+name = "Test Batching - Post-Refactor Polish"
+status = "active"
+current_phase = 0
+last_updated = "2026-06-08"
+
+[blocked_by]
+# This track cannot begin Phase 1 until the refactor is SHIPPED.
+# Verify by checking conductor/tracks.md (status [x]) AND the refactor's
+# state.toml (current_phase = 4 AND phase_4.checkpoint_sha recorded);
+# both checks must pass — see [blocker_verification] below.
+test_batching_refactor_20260606 = "not yet shipped"
+
+[phases]
+phase_1 = { status = "pending", checkpoint_sha = "", name = "Shared _extract_failed_files library" }
+phase_2 = { status = "pending", checkpoint_sha = "", name = "live_gui window foregrounding" }
+phase_3 = { status = "pending", checkpoint_sha = "", name = "focus_test_panel helper + per-test wiring" }
+phase_4 = { status = "pending", checkpoint_sha = "", name = "tests/artifacts/ scratch cleanup" }
+phase_5 = { status = "pending", checkpoint_sha = "", name = "Track finalization (regression run + tracks.md)" }
+
+[tasks]
+# Phase 1: Shared _extract_failed_files library
+t1_1 = { status = "pending", commit_sha = "", description = "Red: 11 unit tests in tests/test_test_failure_parser.py" }
+t1_2 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_failure_parser.py (no re import)" }
+t1_3 = { status = "pending", commit_sha = "", description = "Wire shared parser into post-refactor run_tests_batched.py:_run_batch + SUMMARY" }
+t1_4 = { status = "pending", commit_sha = "", description = "User verification: end-to-end run with deliberate failure shows per-file listing" }
+# Phase 2: live_gui window foregrounding
+t2_1 = { status = "pending", commit_sha = "", description = "Red: 3 unit tests in tests/test_live_gui_foregrounding.py" }
+t2_2 = { status = "pending", commit_sha = "", description = "Green: implement _foreground_subprocess_window in tests/conftest.py" }
+t2_3 = { status = "pending", commit_sha = "", description = "Wire _foreground_subprocess_window into the live_gui fixture" }
+t2_4 = { status = "pending", commit_sha = "", description = "User verification: live_gui test still passes; window helper is no-op-safe" }
+# Phase 3: focus_test_panel helper + per-test wiring
+t3_1 = { status = "pending", commit_sha = "", description = "Add focus_test_panel helper to tests/conftest.py" }
+t3_2 = { status = "pending", commit_sha = "", description = "Wire focus_test_panel into 3 starter sim tests (command_palette, workflow, undo_redo)" }
+t3_3 = { status = "pending", commit_sha = "", description = "User verification: 3 sim tests pass with focus_test_panel calls" }
+# Phase 4: tests/artifacts/ scratch cleanup
+t4_1 = { status = "pending", commit_sha = "", description = "Verify each candidate scratch file is unreferenced (rg across tests/scripts/src/docs)" }
+t4_2 = { status = "pending", commit_sha = "", description = "Delete ~45 scratch files; preserve the 8 in-use entries from FR-20" }
+t4_3 = { status = "pending", commit_sha = "", description = "User verification: directory listing shows only preserved entries" }
+# Phase 5: Track finalization
+t5_1 = { status = "pending", commit_sha = "", description = "Full suite regression run via new orchestrator (or legacy if refactor not yet switched)" }
+t5_2 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md with the completed entry" }
+t5_3 = { status = "pending", commit_sha = "", description = "Archive to conductor/tracks/archive/ (optional; ask user)" }
+
+[verification]
+# Filled as phases complete. The metadata.json's verification_criteria is the source of truth.
+shared_parser_module_exists = false
+shared_parser_unit_tests_pass = false
+shared_parser_no_re_import = false
+orchestrator_per_file_failure_list = false
+foreground_helper_exists = false
+foreground_unit_tests_pass = false
+foreground_wired_into_fixture = false
+focus_test_panel_exists = false
+focus_test_panel_wired_into_3plus_sims = false
+scratch_files_deleted = false
+preserved_files_preserved = false
+full_suite_no_regressions = false
+per_file_accuracy_in_batch5_timeout = false
+
+[blocker_verification]
+# Before starting Phase 1, verify:
+# 1. conductor/tracks.md shows test_batching_refactor_20260606 status [x]
+# 2. conductor/tracks/test_batching_refactor_20260606/state.toml shows current_phase = 4
+#    AND phase_4.checkpoint_sha is non-empty
+# If either check fails, STOP and report to the user. Do not proceed.
+refactor_track_shipped = false
+refactor_state_phase_4_checkpoint_present = false
+refactor_state_phase_4_checkpoint_sha = ""
+
+[files_audit]
+# Cross-reference of files this track touches
+scripts_test_failure_parser_py = { action = "create", notes = "shared FAILED-line parser; no re import" }
+tests_test_test_failure_parser_py = { action = "create", notes = "11 unit tests" }
+tests_test_live_gui_foregrounding_py = { action = "create", notes = "3 unit tests" }
+scripts_run_tests_batched_py = { action = "modify", notes = "wire shared parser into _run_batch + SUMMARY; add --timeout arg" }
+tests_conftest_py = { action = "modify", notes = "add _foreground_subprocess_window + focus_test_panel helpers" }
+tests_test_command_palette_sim_py = { action = "modify", notes = "one-line focus_test_panel call in setup" }
+tests_test_workflow_sim_py = { action = "modify", notes = "one-line focus_test_panel call in setup" }
+tests_test_undo_redo_sim_py = { action = "modify", notes = "one-line focus_test_panel call in setup" }
+tests_artifacts_scratch_files = { action = "delete", notes = "~45 files; verify no references first" }