conductor(track): test_infrastructure_hardening_20260609 - spec to kill the test regression nightmare

2026-06-09 15:15:26 -04:00
parent b4d240a9f3
commit 566cf08cb8
4 changed files with 1630 additions and 0 deletions
@@ -0,0 +1,78 @@
+{
+  "track_id": "test_infrastructure_hardening_20260609",
+  "name": "Test Infrastructure Hardening (2026-06-09)",
+  "created_at": "2026-06-09",
+  "status": "spec",
+  "priority": "A",
+  "blocked_by": [],
+  "blocks": [
+    "qwen_llama_grok_integration_20260606",
+    "data_oriented_error_handling_20260606",
+    "data_structure_strengthening_20260606",
+    "mcp_architecture_refactor_20260606",
+    "code_path_audit_20260607"
+  ],
+  "inherits_from": [
+    "docs/reports/test_infra_hardening_foundation_20260608.md",
+    "docs/reports/batch_resilience_plan_20260608.md",
+    "docs/reports/rag_test_batch_failure_status_20260609_pm3.md",
+    "docs/reports/rag_work_final_20260609_pm.md"
+  ],
+  "supersedes": [
+    "test_harness_hardening_20260310",
+    "test_patch_fixes_20260513",
+    "test_batching_post_refactor_polish_20260607",
+    "fix_remaining_tests_20260513",
+    "manual_ux_validation_20260608_PLACEHOLDER (per FR5 clean_baseline)",
+    "regression_fixes_20260605 (residual live_gui work)"
+  ],
+  "domain": "Meta-Tooling (test infrastructure; not the Application's GUI)",
+  "scope_summary": "Fix 3 root causes of test regression churn (subprocess state pollution, filesystem path hygiene, io_pool race) + 2 related bugs (set_value hook, optional clean-baseline) so the 4 upcoming tracks start from a clean test bed.",
+  "estimated_effort": "6.5 days (Phases 1-8)",
+  "phases": 8,
+  "verification_criteria": [
+    "FR1: Autouse _check_live_gui_health fixture in place; 3 tests in tests/test_live_gui_respawn.py pass",
+    "FR2: 6 test files no longer hardcode Path('tests/artifacts/live_gui_workspace'); live_gui_workspace fixture in place; 3 tests in tests/test_live_gui_workspace_fixture.py pass",
+    "FR3: _sync_rag_engine uses token + dirty flag; 3 tests in tests/test_sync_rag_engine_coalescing.py pass",
+    "FR4: set_value('ai_input', ...) actually mutates controller state; tests/test_gui2_set_value_hook_works.py passes in batch",
+    "FR5: clean_baseline marker in place; 2 tests in tests/test_clean_baseline_marker.py pass",
+    "FR6: docs/reports/test_bed_health_20260609.md written and committed with pass/fail counts",
+    "Audit: 4 audit files committed in conductor/tracks/test_infrastructure_hardening_20260609/audit/",
+    "Audit: scripts/check_test_toml_paths.py extended to flag hardcoded workspace paths",
+    "Docs: docs/guide_testing.md updated with new fixtures (FR1, FR2, FR5)",
+    "All tier-1 + tier-2 tests pass in batch (no regression)",
+    "At least 3 previously-failing tests now pass in batch (the RAG test, the set_value test, the RAG stress test)"
+  ],
+  "out_of_scope": [
+    "Per-file live_gui fixture scope (Solution A from batch_resilience_plan)",
+    "MMA pipeline tests that don't reach 'tracks' state (3 tests, separate code path)",
+    "Negative-flows tests (3 tests, separate code path)",
+    "test_auto_switch_sim (separate code path)",
+    "code_path_audit_20260607 (post-4-tracks)",
+    "chunkification_optimization_20260608_PLACEHOLDER (not yet approved)",
+    "CI infrastructure (no CI in repo)"
+  ],
+  "risks": [
+    {
+      "risk": "Per-test respawn adds >200ms per test (NFR1 violation)",
+      "mitigation": "Measure with the 49 tests in batch; if exceeded, fall back to per-batch respawn"
+    },
+    {
+      "risk": "tmp_path_factory refactor breaks on-disk chroma DB persistence",
+      "mitigation": "Clear .slop_cache/ dirs at session start; OR add a live_gui_workspace_persist opt-in"
+    },
+    {
+      "risk": "conftest.py corruption (previous attempt was reverted)",
+      "mitigation": "git stash before each edit; use manual-slop_set_file_slice; Tier 2 supervises"
+    },
+    {
+      "risk": "set_value fix changes behavior for existing tests that assert on the OLD broken behavior",
+      "mitigation": "Run full tier-3 batch in Phase 5 and verify no regressions"
+    }
+  ],
+  "tier_2_supervision_required_for": [
+    "Phase 1 (audit review)",
+    "Phase 3 (conftest refactor)",
+    "Phase 4 (io_pool race fix)"
+  ]
+}
@@ -0,0 +1,346 @@
+# Track Specification: Test Infrastructure Hardening (2026-06-09)
+
+> **Status:** SPEC FOR APPROVAL. The user has asked for a single track to "kill the test regression nightmare" so the 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) can land on a clean test bed.
+>
+> **Inheritance:** This track absorbs and supersedes:
+> - `docs/reports/test_infra_hardening_foundation_20260608.md` (foundation, 5 phases proposed)
+> - `docs/reports/batch_resilience_plan_20260608.md` (4 solutions; Solution A + C recommended)
+> - `docs/reports/rag_test_batch_failure_status_20260609_pm3.md` (filesystem hygiene findings #1-5)
+> - `docs/reports/rag_work_final_20260609_pm.md` (remaining failures: io_pool race, set_value hook)
+> - The implicit "fix test in batch" goal that the Tier 2 has been chasing for 4+ days
+
+---
+
+## Overview
+
+The test suite has accumulated 49+ live_gui tests that share a single session-scoped subprocess. Recent regression hunts have surfaced 3 distinct failure modes that keep re-emerging under different masks:
+
+1. **Subprocess state pollution** — the 4 sims in `test_extended_sims.py` mutate controller state (`current_provider`, `ui_*` attrs, MMA workflows, RAG sync); subsequent tests in the same batch read dirty state.
+2. **Filesystem hygiene** — the `live_gui` fixture creates `tests/artifacts/live_gui_workspace/` as a HARDCODED relative path; 6 test files re-derive the path independently; `RAGEngine.index_file` joins `base_dir + file_path` with `base_dir` possibly being a relative path, so indexing silently no-ops in batch (the root cause of the RAG test batch failure).
+3. **io_pool race in `_sync_rag_engine`** — multiple setters in quick succession submit parallel sync tasks, last-finished-wins, indexing is non-deterministic.
+
+Each of these has been "fixed" in isolation (RAG dim-mismatch recursion, CWD fallback, embedding provider error surface, ini_content str/bytes sentinel, indent on `_capture_workspace_profile`) but the underlying architectural problems remain. The Tier 2 keeps finding new symptoms.
+
+**This track kills the nightmare by fixing the three root causes with surgical, contained, testable changes that the 4 upcoming tracks need as a precondition.**
+
+---
+
+## Current State Audit (as of 2026-06-09)
+
+### Already Implemented (DO NOT re-implement)
+
+- ✅ `live_gui` fixture exists at `tests/conftest.py:282` (session-scoped)
+- ✅ Fixture kills subprocess on teardown (`tests/conftest.py:516-547`)
+- ✅ `/api/gui_health` endpoint surfaces degraded state (commit `1c565da7`)
+- ✅ Pre-flight `get_gui_health()` check in `test_full_live_workflow` (commit `51ecace4`)
+- ✅ `try/except` around `immapp.run` (commit `1c565da7`)
+- ✅ `_UI_FLAG_DEFAULTS` allowlist for `__getattr__` (commit `bcdc26d0`)
+- ✅ `_ini_capture_ready` defer-not-catch flag for `imgui.save_ini_settings_to_memory` (commit `d7487af4`)
+- ✅ `_capture_workspace_profile` indent fix (sub-track 1 of `live_gui_test_hardening_v2`, commit `26e0ced4`)
+- ✅ `ini_content` str/bytes contract test (`tests/test_workspace_profile_serialization.py`)
+- ✅ `LogPruner` busy-loop backoff (commit `ac08ee87`)
+- ✅ RAG dim-mismatch wipe (commit `64bc04a6`)
+- ✅ RAG `_validate_collection_dim` recursion fix (commit `644d88ab`)
+- ✅ RAG `index_file` CWD fallback (commit `eb8357ec`, uncommitted as of report; needs to be committed as defensive fix)
+- ✅ `sentence-transformers` available in dev env via `[local-rag]` extra (commit `a341d7a7`)
+- ✅ `_sync_rag_engine` surfaces embedding_provider init failure (commit `e62266e8`)
+- ✅ `test_required_test_dependencies.py` enforces test-time deps (commit `b801b11c`)
+- ✅ `isolate_workspace`, `reset_paths`, `reset_ai_client`, `vlogger` autouse fixtures
+- ✅ `audit_main_thread_imports.py` and `audit_weak_types.py` static CI gates
+- ✅ `check_test_toml_paths.py` audit script (CI gate for real-TOML references)
+- ✅ Batch tier-1 + tier-2 + tier-3 + tier-H + tier-P structure (`scripts/run_tests_batched.py`)
+
+### Gaps to Fill (This Track's Scope)
+
+#### Gap 1: `live_gui` subprocess scope + per-test dirty-state guard
+- **What exists:** Session-scoped `live_gui` fixture. Subprocess state survives across 49+ tests.
+- **What's missing:** When a test dies (IM_ASSERT, error result, etc.) the subprocess is degraded; subsequent tests in different files get dirty state. The pre-flight `get_gui_health()` check is file-local, not test-local, and only checks health, doesn't recover.
+- **Real symptom:** `test_rag_phase4_final_verify` passes in isolation, fails in batch. `test_gui2_set_value_hook_works` returns `''` instead of queued value. `test_rag_phase4_stress` non-deterministic indexing.
+
+#### Gap 2: Filesystem hygiene for `live_gui_workspace`
+- **What exists:** `tests/conftest.py:412` hardcodes `Path("tests/artifacts/live_gui_workspace")`. 6 test files re-derive the same path independently.
+- **What's missing:** The path is relative to CWD. When the test runner or prior tests shift CWD, all downstream path joins break. `RAGEngine.index_file` joins `base_dir + file_path`; when `base_dir` is relative and CWD has drifted, the file doesn't exist, indexing silently no-ops.
+- **Real symptom:** RAG test in batch finds 0 documents in collection. `chroma_test_final_verify` count=0. `chroma_db` collection count=0. `chroma_test_stress` count=0. Only `chroma_manual_slop` (the user's project, NOT a test) has 328 docs from a separate session.
+- **Files affected:**
+  - `tests/conftest.py:412` (HARDCODED)
+  - `tests/test_rag_phase4_final_verify.py:20`
+  - `tests/test_rag_phase4_stress.py:21`
+  - `tests/test_saved_presets_sim.py:14, 121`
+  - `tests/test_tool_presets_sim.py:13`
+  - `tests/test_visual_sim_gui_ux.py:79`
+
+#### Gap 3: `_sync_rag_engine` io_pool race
+- **What exists:** `src/app_controller.py` `_sync_rag_engine` submits a sync task to `_io_pool` for each `set_value` that mutates `rag_config`. Multiple setters in quick succession → multiple parallel sync tasks → non-deterministic indexing.
+- **What's missing:** A coalescing/debounce pattern that serializes sync attempts within a short window (e.g., 100ms).
+- **Real symptom:** Test fires 5 setters (`rag_collection_name`, `files`, `rag_enabled`, `rag_source`, `rag_emb_provider`) in succession. Each submits a sync. The last one to *finish* wins, but indexing happens against whichever engine finished last. The test then asserts on the wrong engine's output.
+
+#### Gap 4: `set_value` hook test failure (pre-existing, separate code path)
+- **What exists:** `test_gui2_set_value_hook_works` line 41 — `set_value` returns `'queued'` but `get_value('ai_input')` returns `''` after 1.5s.
+- **What's missing:** A `setattr` routing issue in `gui_2.py` similar to the earlier `_UI_FLAG_DEFAULTS` fix. The test's input doesn't actually reach the controller.
+- **Real symptom:** Test fails in batch; same class of bug as the `_UI_FLAG_DEFAULTS` allowlist bug (commit `bcdc26d0`).
+
+#### Gap 5: Tests assert against dirty subprocess state from prior tests
+- **What exists:** Test isolation is implicit (assumes clean state from prior fixture). When a prior test's `set_value` calls pollute the controller, subsequent tests fail in ways unrelated to their code.
+- **What's missing:** A `_reset_controller_state` hook that the `live_gui` fixture exposes, so each test can opt-in to a clean baseline.
+
+---
+
+## Goals
+
+1. **Goal A: Per-test subprocess resilience.** Make the `live_gui` fixture recover from a degraded subprocess BEFORE each test (not just before each file). When the subprocess dies mid-test, the next test gets a fresh one.
+2. **Goal B: Path hygiene for the live_gui workspace.** Refactor `tests/conftest.py:live_gui` to use `tmp_path_factory.mktemp("live_gui_workspace")` and expose the path as a separate fixture. Update all dependent test files to consume the fixture instead of hardcoding the path.
+3. **Goal C: Eliminate `_sync_rag_engine` race.** Add a coalescing/debounce pattern so 5 setters in 100ms produce 1 sync, not 5 parallel syncs.
+4. **Goal D: Fix `set_value` hook routing.** Find the `__setattr__` bug that causes `set_value('ai_input', ...)` to not actually mutate the controller's `ai_input` state, and fix it the same way `_UI_FLAG_DEFAULTS` was fixed.
+5. **Goal E: Test files assert against fresh state.** Add a `_reset_controller_state` fixture that any test can opt into via autouse-on-marker (`@pytest.mark.clean_baseline`).
+6. **Goal F: Verify all 4 upcoming tracks have a clean test bed.** Run the full tier-1 + tier-2 + tier-3 batch and document which tests pass in batch vs. isolation. The 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) start with a known green baseline.
+
+### Non-Goals (Out of Scope)
+
+- ❌ Refactoring the `live_gui` fixture to per-file scope (Solution A in `batch_resilience_plan_20260608.md`). Solution D (autouse health check + respawn) is the surgical alternative; per-file is too coarse.
+- ❌ Refactoring `src/rag_engine.py` to a chunk-based data structure (that's the `chunkification_optimization_20260608_PLACEHOLDER` track).
+- ❌ Migrating `live_gui` tests to mock-based tests (preserves the integration value).
+- ❌ Adding CI infrastructure (this repo has no CI; manual batch runs are the verification).
+- ❌ Fixing the 3 mock_app tests in `test_z_negative_flows.py` (separate code path; deferred).
+- ❌ Fixing the 3 MMA pipeline tests that don't reach "tracks" state (separate code path; deferred).
+- ❌ Fixing the `auto_switch_sim` test (separate code path; deferred).
+- ❌ Doing the `code_path_audit_20260607` work (post-4-tracks; the audit is the post-condition).
+
+---
+
+## Functional Requirements
+
+### FR1. Per-test subprocess health check + respawn
+
+**Where:** `tests/conftest.py:282` (the `live_gui` fixture)
+
+**What:** Add an autouse fixture that runs AFTER `live_gui` and BEFORE each test that uses it. The fixture:
+1. Calls `client.get_gui_health()` with a 1s timeout.
+2. If health is "degraded" OR the response is None OR the call raises, calls `_respawn_subprocess()`.
+3. After respawn (or if health was already OK), verifies the subprocess is alive via the existing `kill_process_tree` machinery.
+
+**API:**
+```python
+@pytest.fixture(autouse=True)
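+def _check_live_gui_health(request):
+    # Resolve live_gui lazily: naming it as a parameter of an autouse fixture
+    # would start the GUI subprocess for every test in the suite (violates NFR3).
+    if "live_gui" in request.fixturenames:
+        handle, _ = request.getfixturevalue("live_gui")
+        handle.ensure_alive()  # does the health check + respawn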
+    yield
+```
+
+**Tests required:**
+- `test_live_gui_respawn_after_kill`: kill the subprocess via the handle, run a no-op test that uses `live_gui`, assert the subprocess is alive at test end.
+- `test_live_gui_health_check_fast_path`: when the subprocess is alive, the health check is <100ms.
+- `test_live_gui_no_respawn_on_clean`: when the subprocess is alive AND `get_gui_health()` returns OK, no respawn happens (verify via a `respawn_count` counter on the handle).
+
+### FR2. Expose `live_gui_workspace` as a separate fixture
+
+**Where:** `tests/conftest.py:282` (the `live_gui` fixture), plus 6 test files
+
+**What:**
+1. Change `live_gui` to create the workspace via `tmp_path_factory.mktemp("live_gui_workspace")` instead of `Path("tests/artifacts/live_gui_workspace")`.
+2. Add a new fixture `live_gui_workspace` that yields the absolute path to the workspace.
+3. The `live_gui` fixture uses `chdir` (or sets the subprocess CWD) to the absolute path; the subprocess inherits the correct CWD.
+4. Update 6 test files to accept `live_gui_workspace` as a fixture parameter and use the absolute path instead of the hardcoded one.
+
+**Tests required:**
+- `test_live_gui_workspace_is_absolute`: assert the workspace path is absolute.
+- `test_live_gui_workspace_unique_per_session`: assert two consecutive sessions get different workspace dirs (per-session `mktemp` returns unique dirs).
+- `test_live_gui_workspace_passed_to_test`: request `live_gui_workspace` as a fixture argument in a test, assert the test can create files in it.
+
+**Files to update:**
+- `tests/conftest.py:412` — replace `Path("tests/artifacts/live_gui_workspace")` with `tmp_path_factory.mktemp("live_gui_workspace")`
+- `tests/test_rag_phase4_final_verify.py:20` — accept `live_gui_workspace` fixture
+- `tests/test_rag_phase4_stress.py:21` — accept `live_gui_workspace` fixture
+- `tests/test_saved_presets_sim.py:14, 121` — accept `live_gui_workspace` fixture
+- `tests/test_tool_presets_sim.py:13` — accept `live_gui_workspace` fixture
+- `tests/test_visual_sim_gui_ux.py:79` — accept `live_gui_workspace` fixture
+
+### FR3. Coalesce `_sync_rag_engine` calls
+
+**Where:** `src/app_controller.py:_sync_rag_engine` (or the setter that triggers it)
+
+**What:** Replace the immediate-submit pattern with a debounce/coalesce pattern. Multiple setters within a 100ms window produce ONE sync, run on the next idle moment.
+
+**Approach:** Add a `_rag_sync_token: Optional[int]` and a `_rag_sync_dirty: bool` flag. When a setter mutates `rag_config`, increment the token and set dirty. A background "sync dispatcher" task (or a deferred submit) reads the token, builds the engine once, sets the engine, and clears the flag. If a new setter comes in while a sync is running, increment the token, set dirty, the running sync sees the new token and re-runs once.
+
+**Tests required:**
+- `test_sync_rag_engine_coalesces_five_setters`: fire 5 setters in 50ms, assert only 1 `RAGEngine()` is constructed.
+- `test_sync_rag_engine_rerun_on_token_change`: while a sync is running, fire a setter; assert the sync sees the new token and re-runs once.
+- `test_sync_rag_engine_idempotent_no_changes`: if no setters fire, no sync runs.
+
+### FR4. Fix `set_value` hook routing for `ai_input`
+
+**Where:** `src/gui_2.py:__setattr__` (or `src/app_controller.py:_handle_set_value`)
+
+**What:** Investigate the `__setattr__` / `__setstate__` chain. The test (`tests/test_gui2_set_value_hook_works.py`) calls `client.set_value('ai_input', 'hello')`, which posts to `/api/gui/set_value`, which calls `controller.<some_method>`. The method either doesn't actually mutate `ai_input` or routes the value to a different attribute (similar to how `_UI_FLAG_DEFAULTS` was incorrectly returning `None`).
+
+**Likely root cause:** Either:
+- The `__setattr__` allowlist only includes certain `ui_` attrs, and `ai_input` is not on it, so the assignment is silently dropped.
+- The `/api/gui/set_value` endpoint has a `field != 'ai_input'` branch that doesn't call the setter.
+
+**Tests required:**
+- `test_set_value_hook_ai_input`: assert that after `set_value('ai_input', 'hello')` and a 0.5s wait, `get_value('ai_input')` returns `'hello'`.
+- `test_set_value_hook_temperature`: same for `temperature`.
+- `test_set_value_hook_persists`: same for `model_name`.
+
+**Diagnostic test (write first):** A test that introspects the controller's `__dict__` and the API hook's parameter-to-handler mapping to find the missing branch.
+
+### FR5. Optional clean-baseline marker
+
+**Where:** `tests/conftest.py` (new fixture), test files that want it
+
+**What:** Add a `@pytest.mark.clean_baseline` marker. An autouse fixture detects the marker and calls a `_reset_controller_state` method on the controller before the test starts. The reset clears: `ai_input`, `ai_status`, `ai_response`, `current_provider`, `current_model`, `rag_config`, `files`, `mma_streams`, `mma_epic_input`, `mma_proposed_tracks`, plus any field set by a prior test.
+
+**API:**
+```python
+@pytest.fixture(autouse=True)
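+def _clean_baseline(request):
+    # Resolve live_gui lazily so unmarked tests never start the GUI subprocess (NFR3).
+    if request.node.get_closest_marker("clean_baseline"):
+        handle, _ = request.getfixturevalue("live_gui")
+        handle.client.reset_session()  # existing endpoint, plus extended reset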
+    yield
+```
+
+**Tests required:**
+- `test_clean_baseline_resets_ai_input`: set `ai_input='polluted'`, mark test with `clean_baseline`, assert `ai_input` is `''` at test start.
+- `test_clean_baseline_resets_rag_config`: same for `rag_config`.
+
+### FR6. Verify the 4 upcoming tracks have a clean test bed
+
+**Where:** `scripts/run_tests_batched.py` (no changes); verification in this track's final phase
+
+**What:** Run the full tier-1 + tier-2 + tier-3 batch and document which tests pass. Produce a "test bed health report" as a markdown file in `docs/reports/test_bed_health_20260609.md`. The report lists:
+- Tier-1 unit tests: all pass (already verified in `rag_work_final_20260609_pm.md`)
+- Tier-2 mock_app tests: all pass
+- Tier-3 live_gui tests: pass/fail per file, with the failure mode
+- A "before" / "after" diff so the user can see the impact
+
+---
+
+## Non-Functional Requirements
+
+- **NFR1: Per-test overhead < 200ms.** The autouse `_check_live_gui_health` fixture must add <200ms to each test that uses `live_gui`. The 49 live_gui tests × 200ms = 9.8s additional batch time. Acceptable.
+- **NFR2: No regressions in tier-1 / tier-2.** All unit tests and mock_app tests must continue to pass. The fixture change is additive, not destructive.
+- **NFR3: Backward compat for tests that don't opt in.** Tests that don't use `live_gui` are unaffected. Tests that use `live_gui` but don't opt into `clean_baseline` continue to work (they just don't get a reset).
+- **NFR4: No hardcoded paths to C:/projects/manual_slop or ./tests/artifacts/ in test files.** The track's filesystem-hygiene fix is *enforced* by the existing `scripts/check_test_toml_paths.py` audit (extended to also catch `Path("tests/artifacts/")` and `Path("C:/projects/")` in test files).
+- **NFR5: 1-space indentation.** All Python code in this track uses 1-space indentation per `conductor/product-guidelines.md`.
+- **NFR6: CRLF line endings on Windows.** All Python files in this track use CRLF.
+
+---
+
+## Architecture Reference
+
+This track touches the following subsystems (see linked deep-dive guides):
+
+- **Test infrastructure:** `tests/conftest.py`, `scripts/run_tests_batched.py`. See [docs/guide_testing.md](../docs/guide_testing.md) §"7 conftest fixtures" and §"Puppeteer pattern".
+- **AppController state delegation:** `src/app_controller.py` (166KB). See [docs/guide_app_controller.md](../docs/guide_app_controller.md) §"_predefined_callbacks / _gettable_fields Hook API registries" and [docs/guide_state_lifecycle.md](../docs/guide_state_lifecycle.md) §"State Delegation (__getattr__/__setattr__)".
+- **RAG engine:** `src/rag_engine.py`. See [docs/guide_rag.md](../docs/guide_rag.md) §"RAGEngine lifecycle" and §"Sync to controller".
+- **Hook API:** `src/api_hooks.py` + `src/api_hook_client.py`. See [docs/guide_api_hooks.md](../docs/guide_api_hooks.md) §"/api/gui/set_value" and §"Remote Confirmation Protocol".
+- **io_pool:** `src/app_controller.py:_io_pool`. See [docs/guide_architecture.md](../docs/guide_architecture.md) §"Thread domains".
+
+### Key design constraints inherited
+
+- **Defer-not-catch pattern:** `imgui.*` calls before ImGui is ready crash at the C level (0xc0000005). The `_check_live_gui_health` fixture must NOT touch ImGui directly. It uses the existing Hook API (`/api/gui_health`, `/api/status`) which runs in the hook server thread, not the render thread.
+- **Session-scoped fixture:** `live_gui` is session-scoped by design. Per-file or per-test scoping would break cross-test state (e.g., `test_full_live_workflow` expects a fresh `live_gui`, but `test_rag_phase4_stress` depends on the same subprocess the prior 4 sims used). The autouse respawn is the surgical solution.
+- **tmp_path_factory scope:** `tmp_path_factory` is a session-scoped fixture (per the pytest docs); its `mktemp()` method returns a new unique directory on each call. Per-test `tmp_path` is a different, function-scoped fixture. The `live_gui_workspace` fixture must use `tmp_path_factory` to be consistent with the session-scoped `live_gui`.
+
+### Key prior decisions to respect
+
+- The `_UI_FLAG_DEFAULTS` allowlist was a HARD-CODED set. The new `set_value` hook fix should follow the same allowlist pattern (consistency with the existing fix) OR use a class-level attribute that derives from `__init__` annotations (the better fix, but the user has not asked for the better fix; this track stays surgical).
+- The existing `run_tests_batched.py` tier structure (tier-1 unit, tier-2 mock_app, tier-3 live_gui, tier-H headless, tier-P perf) is NOT to be restructured. The track works WITH the existing tier structure.
+- The `audit_main_thread_imports.py` and `audit_weak_types.py` static CI gates are the project's enforcement mechanism. The new `Path("tests/artifacts/")` and `Path("C:/projects/")` patterns are added to `check_test_toml_paths.py` (extended) as a third gate.
+
+---
+
+## Out of Scope
+
+The following are explicitly NOT part of this track. They are mentioned so the user knows they are deferred, not forgotten:
+
+1. **Per-file `live_gui` fixture scope (Solution A from `batch_resilience_plan_20260608.md`):** Not needed if the per-test autouse respawn works. May revisit if the per-test respawn has too much overhead.
+2. **Refactoring `live_gui` fixture to a class-based handle with respawn (Solution B):** Same — only do if per-test respawn is insufficient.
+3. **MMA pipeline tests that don't reach "tracks" state:** 3 tests fail in this pattern (`test_mma_concurrent_tracks_execution`, `test_mma_step_mode_approval_flow`, `test_mma_complete_lifecycle`). These are MMA-engine-state-transition bugs, not test-isolation bugs. Out of scope.
+4. **Negative-flows tests (`test_z_negative_flows.py`):** 3 tests fail in this pattern. They exercise the mock provider's error path. Pre-existing, separate code path. Out of scope.
+5. **`test_auto_switch_sim`:** Workspace auto-switch logic not applying Tier 3 profile. Pre-existing, separate code path. Out of scope.
+6. **`test_prior_session_no_pop_imbalance`:** Already addressed in `live_gui_test_hardening_v2` (commit `26e0ced4`). Verify it still passes.
+7. **`code_path_audit_20260607`:** Post-4-tracks audit. This track unblocks the 4 tracks; the audit runs after.
+8. **`chunkification_optimization_20260608_PLACEHOLDER`:** The comms.log chunkification. Out of scope; the user has not approved it.
+9. **`manual_ux_validation_20260608_PLACEHOLDER`:** The ASCII-sketch workflow. Out of scope; the user has not approved it.
+10. **CI infrastructure:** No CI in this repo. Manual batch runs are the verification.
+
+---
+
+## Verification Criteria
+
+This track is "done" when ALL of the following are true:
+
+1. ✅ All tier-1 unit tests pass in batch (no regression).
+2. ✅ All tier-2 mock_app tests pass in batch (no regression).
+3. ✅ The 6 test files that hardcoded `Path("tests/artifacts/live_gui_workspace")` now use the `live_gui_workspace` fixture.
+4. ✅ `test_rag_phase4_final_verify.py::test_phase4_final_verify` passes in BATCH (after 4 sims) — the primary symptom the user wanted fixed.
+5. ✅ `test_rag_phase4_stress.py` passes in batch OR has a documented reason for the residual flakiness (acceptable per `rag_work_final_20260609_pm.md`'s "out of scope" decision IF the io_pool race fix in FR3 lands).
+6. ✅ `test_gui2_set_value_hook_works` passes in batch.
+7. ✅ The autouse `_check_live_gui_health` fixture is in place; a new test (`test_live_gui_respawn_after_kill`) verifies it.
+8. ✅ The `_sync_rag_engine` coalescing fix is in place; a new test (`test_sync_rag_engine_coalesces_five_setters`) verifies it.
+9. ✅ A `docs/reports/test_bed_health_20260609.md` report is committed, listing pass/fail per test file with the failure mode for any residual failures.
+10. ✅ `scripts/check_test_toml_paths.py` is extended to flag `Path("tests/artifacts/")` and `Path("C:/projects/")` in test files; the audit passes.
+
+---
+
+## Risk Assessment
+
+| Risk | Likelihood | Impact | Mitigation |
+|---|---|---|---|
+| Per-test respawn adds too much overhead (>200ms × 49 tests = 10s) | Medium | Low | Verify with the NFR1 measurement; if exceeded, fall back to per-batch respawn |
+| Per-test respawn breaks cross-test state dependencies | Medium | High | Add a `--no-respawn` pytest flag for tests that need cross-test state; audit the 49 live_gui tests for state dependencies during Phase 1 (before the Phase 2 respawn fixture lands) |
+| `tmp_path_factory.mktemp` changes the workspace path, breaking the on-disk chroma DB persistence assumption | High | Low | Clear `.slop_cache/` dirs at session start; OR add a `live_gui_workspace_persist` opt-in |
+| `_sync_rag_engine` coalescing breaks the existing RAG test that DEPENDS on multiple parallel syncs (unlikely) | Low | Medium | Write the FR3 tests to verify both "5 setters → 1 sync" AND "single setter → single sync" still work |
+| `set_value` hook fix changes behavior for existing tests that assert on the OLD (broken) behavior | Low | High | Run the full tier-3 batch in Phase 5 and verify no regressions |
+| The `tmp_path_factory.mktemp` refactor corrupts `tests/conftest.py` (the previous attempt at this refactor DID corrupt it; commit was reverted per `rag_test_batch_failure_status_20260609_pm3.md`) | High | High | Use `git stash` before each edit; if edit fails, `git stash pop` and try again with `manual-slop_set_file_slice` (which is the recommended surgical tool per `conductor/edit_workflow.md`) |
+
+---
+
+## Phases (summary)
+
+This spec is the entry point. The plan (`plan.md`) breaks these into TDD-ready tasks.
+
+| Phase | Scope | Effort |
+|---|---|---|
+| Phase 1 | Audit: enumerate all `live_gui` cross-test state dependencies, document baseline failure modes | 1 day |
+| Phase 2 | FR1: Per-test subprocess health check + respawn (autouse fixture) | 1 day |
+| Phase 3 | FR2: Expose `live_gui_workspace` as a separate fixture, update 6 test files | 1 day |
+| Phase 4 | FR3: Coalesce `_sync_rag_engine` calls (token + dirty flag pattern) | 1 day |
+| Phase 5 | FR4: Fix `set_value` hook routing for `ai_input` | 1 day |
+| Phase 6 | FR5: Optional `clean_baseline` marker | 0.5 day |
+| Phase 7 | FR6: Run full batch, produce test_bed_health report | 0.5 day |
+| Phase 8 | Docs: update `docs/guide_testing.md` + `docs/guide_state_lifecycle.md` | 0.5 day |
+
+Total: 6.5 days (fits within 1 sprint).
+
+---
+
+## See Also
+
+- **Foundation:** [docs/reports/test_infra_hardening_foundation_20260608.md](../docs/reports/test_infra_hardening_foundation_20260608.md) — original 5-phase plan; this spec supersedes with sharper scope.
+- **Batch resilience:** [docs/reports/batch_resilience_plan_20260608.md](../docs/reports/batch_resilience_plan_20260608.md) — 4 solutions; this spec adopts Solution D (autouse respawn) as primary.
+- **RAG failure status:** [docs/reports/rag_test_batch_failure_status_20260609_pm3.md](../docs/reports/rag_test_batch_failure_status_20260609_pm3.md) — the filesystem hygiene findings that drive FR2.
+- **RAG final report:** [docs/reports/rag_work_final_20260609_pm.md](../docs/reports/rag_work_final_20260609_pm.md) — the io_pool race that drives FR3.
+- **Process anti-patterns:** [conductor/workflow.md](../conductor/workflow.md) §"Process Anti-Patterns (Added 2026-06-09)" — the Deduction Loop and Report-Instead-of-Fix patterns this track is designed to prevent.
+- **Edit workflow:** [conductor/edit_workflow.md](../conductor/edit_workflow.md) — the surgical tool guidance; the conftest refactor MUST use `manual-slop_set_file_slice` after the previous attempt was reverted due to corruption.
+- **Architecture deep-dive:** [docs/guide_testing.md](../docs/guide_testing.md) §"7 conftest fixtures" + [docs/guide_state_lifecycle.md](../docs/guide_state_lifecycle.md) §"State Delegation".
+- **4 upcoming tracks:**
+  - [qwen_llama_grok_integration_20260606](../conductor/tracks/qwen_llama_grok_integration_20260606/) — spec ✓
+  - [data_oriented_error_handling_20260606](../conductor/tracks/data_oriented_error_handling_20260606/) — plan ✓
+  - [data_structure_strengthening_20260606](../conductor/tracks/data_structure_strengthening_20260606/) — plan pending
+  - [mcp_architecture_refactor_20260606](../conductor/tracks/mcp_architecture_refactor_20260606/) — plan pending
+
+---
+
+## Approval Required
+
+This spec requires user approval before the plan is written. Per the conductor workflow:
+
+> The spec is the agent's design intent — it explains WHY, not just WHAT.
+> A plan for an unapproved spec is wasted effort.
+
+The user has asked for a track to "kill the test regression nightmare." This spec defines what "kill" means: 5 surgical fixes (FR1-FR5) + a verification report (FR6) that produces a clean test bed for the 4 upcoming tracks. If the user wants more aggressive scope (e.g., refactoring `live_gui` to per-file scope), revise the spec before approving.
@@ -0,0 +1,142 @@
+# Track state for test_infrastructure_hardening_20260609
+# Updated by Tier 2 Tech Lead as tasks complete
+
+[meta]
+track_id = "test_infrastructure_hardening_20260609"
+name = "Test Infrastructure Hardening (2026-06-09)"
+status = "active"
+current_phase = 0
+last_updated = "2026-06-09"
+
+[blocked_by]
+# No blockers; this track is the foundation for the tracks listed under [blocks] below
+
+[blocks]
+qwen_llama_grok_integration_20260606 = "planned in this track"
+data_oriented_error_handling_20260606 = "planned in this track"
+data_structure_strengthening_20260606 = "planned in this track"
+mcp_architecture_refactor_20260606 = "planned in this track"
+code_path_audit_20260607 = "planned in this track"
+
+[phases]
+phase_1 = { status = "pending", checkpointsha = "", name = "Audit" }
+phase_2 = { status = "pending", checkpointsha = "", name = "FR1: Per-test subprocess health check + respawn" }
+phase_3 = { status = "pending", checkpointsha = "", name = "FR2: live_gui_workspace fixture + 6 test files" }
+phase_4 = { status = "pending", checkpointsha = "", name = "FR3: Coalesce _sync_rag_engine calls" }
+phase_5 = { status = "pending", checkpointsha = "", name = "FR4: Fix set_value hook for ai_input" }
+phase_6 = { status = "pending", checkpointsha = "", name = "FR5: Optional clean_baseline marker" }
+phase_7 = { status = "pending", checkpointsha = "", name = "FR6: Test bed health report" }
+phase_8 = { status = "pending", checkpointsha = "", name = "Docs + audit script extension" }
+
+[tasks]
+# Phase 1: Audit
+t1_1_1 = { status = "pending", commit_sha = "", description = "Enumerate live_gui test cross-file state dependencies" }
+t1_1_2 = { status = "pending", commit_sha = "", description = "Document set_value/get_value/reset_session per test" }
+t1_1_3 = { status = "pending", commit_sha = "", description = "Categorize self-contained vs cross-test-dependent" }
+t1_2_1 = { status = "pending", commit_sha = "", description = "Find hardcoded tests/artifacts/live_gui_workspace references" }
+t1_2_2 = { status = "pending", commit_sha = "", description = "Find Path('C:/projects/') references in tests" }
+t1_3_1 = { status = "pending", commit_sha = "", description = "Read _sync_rag_engine and its callers" }
+t1_3_2 = { status = "pending", commit_sha = "", description = "Write sync_rag_race.md audit" }
+t1_4_1 = { status = "pending", commit_sha = "", description = "Read /api/gui/set_value endpoint" }
+t1_4_2 = { status = "pending", commit_sha = "", description = "Read __setattr__ and _UI_FLAG_DEFAULTS allowlist" }
+t1_4_3 = { status = "pending", commit_sha = "", description = "Diagnostic test of set_value('ai_input')" }
+t1_4_4 = { status = "pending", commit_sha = "", description = "Write set_value_hook.md audit" }
+
+# Phase 2: FR1
+t2_1_1 = { status = "pending", commit_sha = "", description = "Pre-edit checkpoint (git stash)" }
+t2_1_2 = { status = "pending", commit_sha = "", description = "Read existing live_gui fixture" }
+t2_1_3 = { status = "pending", commit_sha = "", description = "Add _LiveGuiHandle class to conftest.py" }
+t2_1_4 = { status = "pending", commit_sha = "", description = "Refactor live_gui fixture to use handle" }
+t2_1_5 = { status = "pending", commit_sha = "", description = "Update all 49 live_gui test files to use new API" }
+t2_1_6 = { status = "pending", commit_sha = "", description = "Run representative test to verify refactor" }
+t2_1_7 = { status = "pending", commit_sha = "", description = "Commit refactor" }
+t2_2_1 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_live_gui_respawn.py" }
+t2_2_2 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" }
+t2_2_3 = { status = "pending", commit_sha = "", description = "Add autouse _check_live_gui_health fixture" }
+t2_2_4 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" }
+t2_2_5 = { status = "pending", commit_sha = "", description = "Run full tier-3 batch to verify no regression" }
+t2_2_6 = { status = "pending", commit_sha = "", description = "Commit autouse fixture" }
+
+# Phase 3: FR2
+t3_1_1 = { status = "pending", commit_sha = "", description = "Pre-edit checkpoint" }
+t3_1_2 = { status = "pending", commit_sha = "", description = "Refactor live_gui to use tmp_path_factory.mktemp" }
+t3_1_3 = { status = "pending", commit_sha = "", description = "Verify fixture still spawns correctly" }
+t3_1_4 = { status = "pending", commit_sha = "", description = "Verify workspace is in tmp dir" }
+t3_1_5 = { status = "pending", commit_sha = "", description = "Commit tmp_path_factory refactor" }
+t3_2_1 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_live_gui_workspace_fixture.py" }
+t3_2_2 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" }
+t3_2_3 = { status = "pending", commit_sha = "", description = "Add live_gui_workspace fixture" }
+t3_2_4 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" }
+t3_2_5 = { status = "pending", commit_sha = "", description = "Commit live_gui_workspace fixture" }
+t3_3_1 = { status = "pending", commit_sha = "", description = "Read each of 6 test files, identify hardcoded reference" }
+t3_3_2 = { status = "pending", commit_sha = "", description = "Refactor 6 test files to use fixture (one at a time)" }
+t3_3_3 = { status = "pending", commit_sha = "", description = "Run each updated test file in isolation" }
+t3_3_4 = { status = "pending", commit_sha = "", description = "Run in batch to verify the RAG test passes after 4 sims" }
+t3_3_5 = { status = "pending", commit_sha = "", description = "Commit 6-file refactor" }
+
+# Phase 4: FR3
+t4_1_1 = { status = "pending", commit_sha = "", description = "Read existing _sync_rag_engine and setters" }
+t4_1_2 = { status = "pending", commit_sha = "", description = "Add coalescing state to AppController.__init__" }
+t4_1_3 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_sync_rag_engine_coalescing.py" }
+t4_1_4 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" }
+t4_1_5 = { status = "pending", commit_sha = "", description = "Refactor _sync_rag_engine to use token + dirty flag" }
+t4_1_6 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" }
+t4_1_7 = { status = "pending", commit_sha = "", description = "Run RAG test in batch to verify race is fixed" }
+t4_1_8 = { status = "pending", commit_sha = "", description = "Commit io_pool race fix" }
+
+# Phase 5: FR4
+t5_1_1 = { status = "pending", commit_sha = "", description = "Read failing test test_gui2_set_value_hook_works.py" }
+t5_1_2 = { status = "pending", commit_sha = "", description = "Run test to confirm failure" }
+t5_1_3 = { status = "pending", commit_sha = "", description = "Trace flow with diagnostic (in tests/artifacts, not src/)" }
+t5_2_1 = { status = "pending", commit_sha = "", description = "Apply surgical fix" }
+t5_2_2 = { status = "pending", commit_sha = "", description = "Verify test passes" }
+t5_2_3 = { status = "pending", commit_sha = "", description = "Commit set_value fix" }
+
+# Phase 6: FR5
+t6_1_1 = { status = "pending", commit_sha = "", description = "Add clean_baseline marker to pyproject.toml" }
+t6_1_2 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_clean_baseline_marker.py" }
+t6_1_3 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" }
+t6_1_4 = { status = "pending", commit_sha = "", description = "Add autouse _reset_clean_baseline fixture" }
+t6_1_5 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" }
+t6_1_6 = { status = "pending", commit_sha = "", description = "Commit clean_baseline marker" }
+
+# Phase 7: FR6
+t7_1_1 = { status = "pending", commit_sha = "", description = "Run tier-1 unit tests" }
+t7_1_2 = { status = "pending", commit_sha = "", description = "Run tier-2 mock_app tests" }
+t7_1_3 = { status = "pending", commit_sha = "", description = "Run tier-3 live_gui tests" }
+t7_1_4 = { status = "pending", commit_sha = "", description = "Summarize pass/fail" }
+t7_2_1 = { status = "pending", commit_sha = "", description = "Write docs/reports/test_bed_health_20260609.md" }
+t7_2_2 = { status = "pending", commit_sha = "", description = "Commit test_bed_health report" }
+
+# Phase 8: Docs + audit
+t8_1_1 = { status = "pending", commit_sha = "", description = "Read existing check_test_toml_paths.py" }
+t8_1_2 = { status = "pending", commit_sha = "", description = "Add new patterns to audit script" }
+t8_1_3 = { status = "pending", commit_sha = "", description = "Run audit to verify 0 violations" }
+t8_1_4 = { status = "pending", commit_sha = "", description = "Write TDD test for the audit" }
+t8_1_5 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" }
+t8_1_6 = { status = "pending", commit_sha = "", description = "Commit audit extension" }
+t8_2_1 = { status = "pending", commit_sha = "", description = "Read existing guide_testing.md" }
+t8_2_2 = { status = "pending", commit_sha = "", description = "Add §8 Per-test subprocess resilience" }
+t8_2_3 = { status = "pending", commit_sha = "", description = "Commit docs update" }
+
+[verification]
+phase_1_audits_committed = false
+phase_2_respawn_fixture_works = false
+phase_3_rag_test_passes_in_batch = false
+phase_4_io_pool_race_fixed = false
+phase_5_set_value_works_in_batch = false
+phase_6_clean_baseline_marker_works = false
+phase_7_test_bed_health_report_committed = false
+phase_8_docs_and_audit_extended = false
+
+[baseline_capture]
+# Captured in Phase 0 of the plan (the baseline step that precedes phase_1; it has no entry under [phases])
+# Will be populated by Tier 2 before Phase 1 begins
+tier_1_status = "TBD"
+tier_2_status = "TBD"
+tier_3_status = "TBD"
+batch_log = "TBD"
+
+[user_corrections_log]
+# Record user corrections here as the track progresses
+# Format: phase_num, original_claim, correction, reason