From 566cf08cb8dca1c64efad3a632757eefc10cc20a Mon Sep 17 00:00:00 2001 From: Ed_ Date: Tue, 9 Jun 2026 15:15:26 -0400 Subject: [PATCH] conductor(track): test_infrastructure_hardening_20260609 - spec to kill the test regression nightmare --- .../metadata.json | 78 ++ .../plan.md | 1064 +++++++++++++++++ .../spec.md | 346 ++++++ .../state.toml | 142 +++ 4 files changed, 1630 insertions(+) create mode 100644 conductor/tracks/test_infrastructure_hardening_20260609/metadata.json create mode 100644 conductor/tracks/test_infrastructure_hardening_20260609/plan.md create mode 100644 conductor/tracks/test_infrastructure_hardening_20260609/spec.md create mode 100644 conductor/tracks/test_infrastructure_hardening_20260609/state.toml diff --git a/conductor/tracks/test_infrastructure_hardening_20260609/metadata.json b/conductor/tracks/test_infrastructure_hardening_20260609/metadata.json new file mode 100644 index 00000000..fb74ec62 --- /dev/null +++ b/conductor/tracks/test_infrastructure_hardening_20260609/metadata.json @@ -0,0 +1,78 @@ +{ + "track_id": "test_infrastructure_hardening_20260609", + "name": "Test Infrastructure Hardening (2026-06-09)", + "created_at": "2026-06-09", + "status": "spec", + "priority": "A", + "blocked_by": [], + "blocks": [ + "qwen_llama_grok_integration_20260606", + "data_oriented_error_handling_20260606", + "data_structure_strengthening_20260606", + "mcp_architecture_refactor_20260606", + "code_path_audit_20260607" + ], + "inherits_from": [ + "docs/reports/test_infra_hardening_foundation_20260608.md", + "docs/reports/batch_resilience_plan_20260608.md", + "docs/reports/rag_test_batch_failure_status_20260609_pm3.md", + "docs/reports/rag_work_final_20260609_pm.md" + ], + "supersedes": [ + "test_harness_hardening_20260310", + "test_patch_fixes_20260513", + "test_batching_post_refactor_polish_20260607", + "fix_remaining_tests_20260513", + "manual_ux_validation_20260608_PLACEHOLDER (per FR5 clean_baseline)", + "regression_fixes_20260605 (residual live_gui 
work)" + ], + "domain": "Meta-Tooling (test infrastructure; not the Application's GUI)", + "scope_summary": "Fix 3 root causes of test regression churn (subprocess state pollution, filesystem path hygiene, io_pool race) + 2 related bugs (set_value hook, optional clean-baseline) so the 4 upcoming tracks start from a clean test bed.", + "estimated_effort": "6.5 days (Phases 1-8)", + "phases": 8, + "verification_criteria": [ + "FR1: Autouse _check_live_gui_health fixture in place; 3 tests in tests/test_live_gui_respawn.py pass", + "FR2: 6 test files no longer hardcode Path('tests/artifacts/live_gui_workspace'); live_gui_workspace fixture in place; 3 tests in tests/test_live_gui_workspace_fixture.py pass", + "FR3: _sync_rag_engine uses token + dirty flag; 3 tests in tests/test_sync_rag_engine_coalescing.py pass", + "FR4: set_value('ai_input', ...) actually mutates controller state; tests/test_gui2_set_value_hook_works.py passes in batch", + "FR5: clean_baseline marker in place; 2 tests in tests/test_clean_baseline_marker.py pass", + "FR6: docs/reports/test_bed_health_20260609.md written and committed with pass/fail counts", + "Audit: 4 audit files committed in conductor/tracks/test_infrastructure_hardening_20260609/audit/", + "Audit: scripts/check_test_toml_paths.py extended to flag hardcoded workspace paths", + "Docs: docs/guide_testing.md updated with new fixtures (FR1, FR2, FR5)", + "All tier-1 + tier-2 tests pass in batch (no regression)", + "At least 3 previously-failing tests now pass in batch (the RAG test, the set_value test, the RAG stress test)" + ], + "out_of_scope": [ + "Per-file live_gui fixture scope (Solution A from batch_resilience_plan)", + "MMA pipeline tests that don't reach 'tracks' state (3 tests, separate code path)", + "Negative-flows tests (3 tests, separate code path)", + "test_auto_switch_sim (separate code path)", + "code_path_audit_20260607 (post-4-tracks)", + "chunkification_optimization_20260608_PLACEHOLDER (not yet approved)", + "CI 
infrastructure (no CI in repo)" + ], + "risks": [ + { + "risk": "Per-test respawn adds >200ms per test (NFR1 violation)", + "mitigation": "Measure with the 49 tests in batch; if exceeded, fall back to per-batch respawn" + }, + { + "risk": "tmp_path_factory refactor breaks on-disk chroma DB persistence", + "mitigation": "Clear .slop_cache/ dirs at session start; OR add a live_gui_workspace_persist opt-in" + }, + { + "risk": "conftest.py corruption (previous attempt was reverted)", + "mitigation": "git stash before each edit; use manual-slop_set_file_slice; Tier 2 supervises" + }, + { + "risk": "set_value fix changes behavior for existing tests that assert on the OLD broken behavior", + "mitigation": "Run full tier-3 batch in Phase 5 and verify no regressions" + } + ], + "tier_2_supervision_required_for": [ + "Phase 1 (audit review)", + "Phase 3 (conftest refactor)", + "Phase 4 (io_pool race fix)" + ] +} diff --git a/conductor/tracks/test_infrastructure_hardening_20260609/plan.md b/conductor/tracks/test_infrastructure_hardening_20260609/plan.md new file mode 100644 index 00000000..b1dfe2d2 --- /dev/null +++ b/conductor/tracks/test_infrastructure_hardening_20260609/plan.md @@ -0,0 +1,1064 @@ +# Test Infrastructure Hardening — Implementation Plan + +> **For Tier 3 workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **Tier 2 supervision required:** Phase 1, Phase 3 (the conftest refactor), and Phase 4 (the `_sync_rag_engine` race fix) MUST be supervised by a Tier 2 Tech Lead. These touch the session-scoped `live_gui` fixture and the controller's hot path; the prior attempt at the conftest refactor was reverted due to corruption (see `docs/reports/rag_test_batch_failure_status_20260609_pm3.md`). 
+ +**Goal:** Fix the 3 root causes of test regression churn (subprocess state pollution, filesystem path hygiene, io_pool race) + 2 related bugs (set_value hook, optional clean-baseline) so the 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) start from a known green baseline. + +**Architecture:** Each phase is self-contained, with TDD: failing test first, then minimum implementation, then verify pass, then commit. Per-task atomic commits. No batching. + +**Tech Stack:** Python 3.11+, pytest, FastAPI/Uvicorn (live_gui), tmp_path_factory, threading.Lock. + +--- + +## Pre-Phase 0: Tier 2 checkpoint + dirty-state audit + +Before starting Phase 1, the Tier 2 Tech Lead must: + +- [ ] **Step 0.1: Read all referenced reports** + - `docs/reports/rag_test_batch_failure_status_20260609_pm3.md` (filesystem hygiene findings) + - `docs/reports/rag_work_final_20260609_pm.md` (io_pool race, set_value hook) + - `docs/reports/test_infra_hardening_foundation_20260608.md` (foundation, 5 phases) + - `docs/reports/batch_resilience_plan_20260608.md` (4 batch-resilience solutions) + - `conductor/edit_workflow.md` (surgical tool guidance) + +- [ ] **Step 0.2: Verify the dirty working tree is safe** + - Working tree currently has uncommitted changes in `config.toml`, `manualslop_layout.ini`, `project_history.toml`, `src/warmup.py`. These are user workspace artifacts, NOT test infrastructure. + - **Do NOT commit these.** They are out of scope. + - Use `git stash --keep-index` or commit them separately if the user requests. 
+ +- [ ] **Step 0.3: Run the current batch baseline to capture "before" state** + ```powershell + cd C:\projects\manual_slop; uv run .\scripts\run_tests_batched.py 2>&1 | Tee-Object -FilePath "tests\artifacts\batch_baseline_20260609.log" | Select-Object -Last 50 + ``` + Expected: tier-1 + tier-2 pass; tier-3 has the documented failures (RAG dim-mismatch, set_value hook, RAG phase4 final verify, RAG phase4 stress). + +--- + +## Phase 1: Audit (no code changes) + +Focus: Catalog the existing state so Phases 2-7 have a data-grounded baseline. + +### Task 1.1: Enumerate `live_gui` test cross-file state dependencies + +**Files:** +- Read: `tests/conftest.py:282-547` (the `live_gui` fixture) +- Read: all 49+ test files that use `live_gui` + +- [ ] **Step 1.1.1: Generate the live_gui test inventory** + ```powershell + cd C:\projects\manual_slop; uv run python -c " + from pathlib import Path + import re + root = Path('tests') + files = sorted(root.glob('test_*.py')) + users = [] + for f in files: + text = f.read_text(encoding='utf-8') + if 'live_gui' in text: + users.append(f.name) + print(f'{len(users)} test files use live_gui:') + for u in users: + print(f' {u}') + " + ``` + Save output to `conductor/tracks/test_infrastructure_hardening_20260609/audit/live_gui_users.txt`. + +- [ ] **Step 1.1.2: For each live_gui test file, grep for `set_value` calls and `get_value` calls** + ```powershell + cd C:\projects\manual_slop; $(foreach ($f in (Get-Content conductor/tracks/test_infrastructure_hardening_20260609/audit/live_gui_users.txt | Select-Object -Skip 1 | ForEach-Object { $_.Trim() })) { + Write-Output "=== $f ==="; + Select-String -Path "tests\$f" -Pattern '(set_value|get_value|reset_session)' | Select-Object LineNumber, Line | Format-Table -AutoSize + }) | Tee-Object -FilePath "conductor/tracks/test_infrastructure_hardening_20260609/audit/live_gui_state_io.txt" + ``` + Save output to the audit directory. This shows which tests read state set by other tests. 
+ +- [ ] **Step 1.1.3: Categorize each test as "self-contained" or "cross-test-dependent"** + Self-contained = no `set_value` calls OR all `set_value` calls are within the same test function. + Cross-test-dependent = has `get_value` calls that depend on a prior test's `set_value`. + + Save the categorization to `conductor/tracks/test_infrastructure_hardening_20260609/audit/live_gui_dependencies.json`: + ```json + { + "self_contained": ["test_a.py", "test_b.py", ...], + "cross_test_dependent": ["test_x.py::test_y", ...] + } + ``` + + - [ ] **Step 1.1.4: Commit the audit** + ```powershell + cd C:\projects\manual_slop; git add conductor/tracks/test_infrastructure_hardening_20260609/audit/ + git commit -m "conductor(audit): catalog live_gui test cross-file state dependencies" + ``` + +### Task 1.2: Document the current `live_gui_workspace` path-hygiene state + +- [ ] **Step 1.2.1: Find all hardcoded references to `tests/artifacts/live_gui_workspace`** + ```powershell + cd C:\projects\manual_slop; rg -n "tests/artifacts/live_gui_workspace" tests/ --type py | Tee-Object -FilePath "conductor/tracks/test_infrastructure_hardening_20260609/audit/hardcoded_paths.txt" + ``` + Expect 7+ matches per the spec's "Files affected" list. + +- [ ] **Step 1.2.2: Find all `Path("C:/projects/")` or `Path("C:\\\\projects\\\\")` references in test files** + ```powershell + cd C:\projects\manual_slop; rg -n 'Path\("C:[/\\]+projects' tests/ --type py | Tee-Object -FilePath "conductor/tracks/test_infrastructure_hardening_20260609/audit/hardcoded_project_root.txt" + ``` + Expect 0+ matches (the spec says none in production code; verify in tests too). 
+ +- [ ] **Step 1.2.3: Commit the audit** + ```powershell + cd C:\projects\manual_slop; git add conductor/tracks/test_infrastructure_hardening_20260609/audit/hardcoded_*.txt + git commit -m "conductor(audit): document hardcoded workspace paths in test suite" + ``` + +### Task 1.3: Document the current `_sync_rag_engine` race + +- [ ] **Step 1.3.1: Read `src/app_controller.py:_sync_rag_engine` and its callers** + Use `manual-slop_py_get_definition` to read `_sync_rag_engine`. Identify: + - The set of setters that trigger sync (e.g., `rag_collection_name`, `files`, `rag_enabled`, `rag_source`, `rag_emb_provider`). + - The submit-to-io_pool call site. + - Whether there's any existing coalescing/debouncing. + +- [ ] **Step 1.3.2: Write the audit to `conductor/tracks/test_infrastructure_hardening_20260609/audit/sync_rag_race.md`** + Format: + ```markdown + # _sync_rag_engine Race Audit + + ## Setters that trigger sync + - `set_rag_collection_name` (src/app_controller.py:N) + - `set_rag_enabled` (src/app_controller.py:N) + - `set_files` (src/app_controller.py:N) + - ... + + ## Submit pattern + [paste 5-10 lines of the submit call] + + ## Coalescing mechanism + [None / Token-based / Lock-based / etc.] + + ## Race scenario + 1. Test fires setter A → submit task T1 + 2. Test fires setter B (50ms later) → submit task T2 + 3. T1 starts on io_pool thread, starts constructing RAGEngine + 4. T2 starts on a different io_pool thread, starts constructing RAGEngine + 5. T1 finishes first, sets self.rag_engine = engine_A + 6. T2 finishes, sets self.rag_engine = engine_B + 7. Test queries self.rag_engine → engine_B (last writer wins) + 8. 
engine_B may not have indexed the file from setter A → test fails + ``` + +- [ ] **Step 1.3.3: Commit the audit** + ```powershell + cd C:\projects\manual_slop; git add conductor/tracks/test_infrastructure_hardening_20260609/audit/sync_rag_race.md + git commit -m "conductor(audit): document _sync_rag_engine race in controller" + ``` + +### Task 1.4: Document the `set_value` hook for `ai_input` + +- [ ] **Step 1.4.1: Read `src/api_hooks.py` `/api/gui/set_value` endpoint** + Use `manual-slop_py_get_definition` to find the endpoint. Identify the parameter-to-handler mapping. + +- [ ] **Step 1.4.2: Read `src/gui_2.py:__setattr__` and the `_UI_FLAG_DEFAULTS` allowlist** + Use `manual-slop_py_get_definition` to read both. Verify the allowlist is in place (from commit `bcdc26d0`). + +- [ ] **Step 1.4.3: Test the failing case directly via the live_gui fixture** + Write a diagnostic test (NOT yet committed) that: + 1. Gets the live_gui fixture. + 2. Calls `client.set_value('ai_input', 'hello')`. + 3. Waits 0.5s. + 4. Calls `client.get_value('ai_input')`. + 5. Prints the result. + + Run with: `cd C:\projects\manual_slop; uv run pytest -s -xvs --no-header tests/test_gui2_set_value_hook_works.py 2>&1 | Select-Object -Last 30` + + If the test fails, read the API hooks endpoint to find the missing branch. + +- [ ] **Step 1.4.4: Write the audit to `conductor/tracks/test_infrastructure_hardening_20260609/audit/set_value_hook.md`** + Format: + ```markdown + # set_value('ai_input') Audit + + ## Endpoint code path + [paste the relevant 10-20 lines from /api/gui/set_value] + + ## Expected flow + 1. POST /api/gui/set_value with {"field": "ai_input", "value": "hello"} + 2. Endpoint calls controller.set_ai_input("hello") (or similar) + 3. Controller sets self.ai_input = "hello" + 4. Subsequent get_value('ai_input') returns "hello" + + ## Actual flow (from diagnostic) + 1. POST returns 'queued' + 2. Controller does NOT set self.ai_input + 3. 
Subsequent get_value returns '' + + ## Root cause + [Identify the missing branch — likely the /api/gui/set_value endpoint has a hardcoded list of fields it handles, and 'ai_input' is not on the list, OR the controller's __setattr__ drops the assignment] + + ## Fix location + [src/api_hooks.py:N or src/gui_2.py:N] + ``` + +- [ ] **Step 1.4.5: Commit the audit** + ```powershell + cd C:\projects\manual_slop; git add conductor/tracks/test_infrastructure_hardening_20260609/audit/set_value_hook.md + git commit -m "conductor(audit): trace set_value('ai_input') flow to find routing bug" + ``` + +### Phase 1 verification + +- [ ] **Step 1.V.1: All 7 audit files from the 4 audit tasks committed** + - `audit/live_gui_users.txt` + - `audit/live_gui_state_io.txt` + - `audit/live_gui_dependencies.json` + - `audit/hardcoded_paths.txt` + - `audit/hardcoded_project_root.txt` + - `audit/sync_rag_race.md` + - `audit/set_value_hook.md` + +- [ ] **Step 1.V.2: User reviews the audit** + - Tier 2 Tech Lead presents the audit to the user. + - User approves before Phase 2 begins. + +--- + +## Phase 2: FR1 — Per-test subprocess health check + respawn + +Focus: Add an autouse fixture that recovers the `live_gui` subprocess before each test, when degraded. + +### Task 2.1: Add a `_LiveGuiHandle` class with `ensure_alive()` + +**Files:** +- Modify: `tests/conftest.py` (add `_LiveGuiHandle` class BEFORE the `live_gui` fixture) + +- [ ] **Step 2.1.1: Pre-edit checkpoint (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git stash push -m "wip before Phase 2" + ``` + (The current working tree has user workspace artifacts that should NOT be in this commit; stash them and re-apply after Phase 2's commit.) + +- [ ] **Step 2.1.2: Read the existing `live_gui` fixture** + Read `tests/conftest.py:282-547` with `manual-slop_get_file_slice`. Note: + - The current `live_gui` fixture creates a subprocess at line 412-450. + - The fixture's `finally` block (line 516-547) calls `kill_process_tree`. 
+ - The fixture yields `(process, gui_script)` (a tuple). + +- [ ] **Step 2.1.3: Refactor `live_gui` to use a `_LiveGuiHandle` class** + Insert a new class BEFORE the `live_gui` fixture (around line 280): + ```python + class _LiveGuiHandle: + def __init__(self, gui_script: str, workspace: Path, log_path: Path) -> None: + self._gui_script = gui_script + self._workspace = workspace + self._log_path = log_path + self._process: subprocess.Popen | None = None + self._lock = threading.Lock() + self._respawn_count = 0 + self._spawn() + + def _spawn(self) -> None: + # Existing fixture spawn logic, lifted from conftest.py:412-450 + # (use the actual spawn logic from the current fixture) + ... + + def is_alive(self) -> bool: + return self._process is not None and self._process.poll() is None + + def ensure_alive(self) -> None: + with self._lock: + if not self.is_alive(): + self._respawn_count += 1 + self._spawn() + + @property + def process(self) -> subprocess.Popen: + self.ensure_alive() + assert self._process is not None + return self._process + + @property + def respawn_count(self) -> int: + return self._respawn_count + ``` + + **CRITICAL — 1-space indent.** Use the exact pattern from `tests/conftest.py`. Do not introduce 4-space indent. + + Use `manual-slop_py_add_def` to insert the class at `top` of the file. Verify the indent via `ast.parse` after insertion. + +- [ ] **Step 2.1.4: Refactor the `live_gui` fixture to use the handle** + Change the fixture from yielding a tuple `(process, gui_script)` to yielding a `_LiveGuiHandle` instance. Update the docstring. + + Use `manual-slop_set_file_slice` to replace the fixture's body. Verify the indent. + +- [ ] **Step 2.1.5: Update all 49 live_gui tests to use the new API** + The current pattern is: + ```python + def test_x(live_gui): + process, gui_script = live_gui + ``` + The new pattern is: + ```python + def test_x(live_gui): + handle = live_gui + process = handle.process + ``` + + This is a sweep across all 49 test files. 
Use `rg` to find all `process, gui_script = live_gui` lines, then sed/Python to replace. + + ```powershell + cd C:\projects\manual_slop; uv run python -c " + from pathlib import Path + root = Path('tests') + count = 0 + for f in root.glob('test_*.py'): + text = f.read_text(encoding='utf-8') + if 'process, gui_script = live_gui' in text: + new_text = text.replace('process, gui_script = live_gui', 'handle = live_gui') + f.write_text(new_text, encoding='utf-8') + count += 1 + print(f'Updated {count} test files') + " + ``` + This is a single in-place edit; verify with `git diff --stat`. The script only rewrites the unpacking line, so any test that still references `process` or `gui_script` afterwards must also get `process = handle.process` (per the new pattern above) before it will run. + +- [ ] **Step 2.1.6: Run a representative live_gui test to verify the refactor** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_gui_startup_smoke.py -v --timeout=30 + ``` + Expected: PASS. If FAIL, revert via `git checkout tests/` and re-investigate. + +- [ ] **Step 2.1.7: Commit the refactor (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add tests/ + git commit -m "refactor(test): wrap live_gui subprocess in _LiveGuiHandle class" + $h = git log -1 --format='%H' + git notes add -m "Refactor the session-scoped live_gui fixture to yield a _LiveGuiHandle instead of a (process, gui_script) tuple. The handle has ensure_alive() and respawn_count. All 49 dependent test files updated to consume the handle. Foundation for the autouse _check_live_gui_health fixture in Task 2.2." 
$h + ``` + +### Task 2.2: Add the autouse `_check_live_gui_health` fixture + +**Files:** +- Modify: `tests/conftest.py` (add the autouse fixture AFTER the `live_gui` fixture) + +- [ ] **Step 2.2.1: Write a failing test (TDD red)** + Create `tests/test_live_gui_respawn.py`: + ```python + import pytest + import time + + def test_live_gui_respawn_after_kill(live_gui): + handle = live_gui + initial_pid = handle.process.pid + initial_respawn_count = handle.respawn_count + handle._process.kill() # use _process: the .process property would auto-respawn + handle._process.wait(timeout=5) + assert not handle.is_alive() + handle.ensure_alive() + assert handle.is_alive() + new_pid = handle.process.pid + assert new_pid != initial_pid + assert handle.respawn_count == initial_respawn_count + 1 + + def test_live_gui_no_respawn_on_clean(live_gui): + handle = live_gui + initial_count = handle.respawn_count + handle.ensure_alive() + assert handle.respawn_count == initial_count + + def test_live_gui_health_check_fast_path(live_gui): + handle = live_gui + t0 = time.perf_counter() + handle.ensure_alive() + elapsed = time.perf_counter() - t0 + assert elapsed < 0.1, f"ensure_alive took {elapsed:.3f}s on a clean subprocess" + ``` + +- [ ] **Step 2.2.2: Run the test to confirm it FAILS** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_live_gui_respawn.py -v --timeout=30 + ``` + Expected: FAIL if `respawn_count` / `ensure_alive()` are not in place yet. If Task 2.1.3 has already landed them, these handle-level tests may pass here; treat that as confirmation of the Task 2.1 refactor and continue. + +- [ ] **Step 2.2.3: Add the autouse fixture to `tests/conftest.py`** + Insert AFTER the `live_gui` fixture: + ```python + @pytest.fixture(autouse=True) + def _check_live_gui_health(request): + if "live_gui" in request.fixturenames: + handle = request.getfixturevalue("live_gui") + handle.ensure_alive() + yield + ``` + Use `manual-slop_py_add_def` with anchor_type `after` and anchor_symbol `live_gui`. + +- [ ] **Step 2.2.4: Run the test to confirm it PASSES** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_live_gui_respawn.py -v --timeout=30 + ``` + Expected: 3 tests PASS. 
+ +- [ ] **Step 2.2.5: Run the full tier-3 live_gui batch to verify no regression** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/ -k "live_gui" -v --timeout=30 -x 2>&1 | Select-Object -Last 50 + ``` + Expected: Most tests pass; the documented failures (RAG dim-mismatch, set_value, RAG phase4) still fail. NO new failures. + +- [ ] **Step 2.2.6: Commit (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add tests/conftest.py tests/test_live_gui_respawn.py + git commit -m "feat(test): autouse _check_live_gui_health recovers from degraded subprocess" + $h = git log -1 --format='%H' + git notes add -m "Adds an autouse fixture that calls handle.ensure_alive() before each test that uses live_gui. If the subprocess is dead, it respawns. If alive, the check is <100ms. Three new tests in tests/test_live_gui_respawn.py verify the respawn, the no-op-on-clean path, and the performance budget." $h + ``` + +### Phase 2 verification + +- [ ] **Step 2.V.1: 3 new tests in `tests/test_live_gui_respawn.py` pass** +- [ ] **Step 2.V.2: No new regressions in tier-3 batch** +- [ ] **Step 2.V.3: User reviews the autouse respawn behavior** + - Per-test respawn adds <200ms per test. Verify with the 49 tests in batch. + - User approves before Phase 3 begins. + +--- + +## Phase 3: FR2 — `live_gui_workspace` fixture + update 6 test files + +Focus: Eliminate hardcoded `Path("tests/artifacts/live_gui_workspace")` from test files. Use `tmp_path_factory.mktemp`. + +**Tier 2 supervised for entire phase** (the prior attempt at this refactor was reverted due to corruption; see `docs/reports/rag_test_batch_failure_status_20260609_pm3.md`). + +### Task 3.1: Refactor `live_gui` to use `tmp_path_factory.mktemp` + +**Files:** +- Modify: `tests/conftest.py` (the `live_gui` fixture's workspace creation) + +- [ ] **Step 3.1.1: Pre-edit checkpoint** + ```powershell + cd C:\projects\manual_slop; git add . 
&& git commit -m "wip: pre-Phase 3 checkpoint" --allow-empty + ``` + +- [ ] **Step 3.1.2: Use `manual-slop_set_file_slice` to replace the workspace creation** + Read `tests/conftest.py:410-414` with `manual-slop_get_file_slice`. Note the EXACT text of the lines that create the workspace (the `Path("tests/artifacts/live_gui_workspace")` reference and the surrounding `os.makedirs` or `mkdir` calls). + + Replace ONLY those lines with: + ```python + workspace = tmp_path_factory.mktemp("live_gui_workspace") + ``` + where `tmp_path_factory` is added to the fixture's parameters. + + The fixture signature changes from: + ```python + def live_gui(request): + ``` + to: + ```python + def live_gui(request, tmp_path_factory): + ``` + + **CRITICAL — verify via `ast.parse` after the edit.** + + Use `manual-slop_py_check_syntax tests/conftest.py` to confirm syntax is valid. + +- [ ] **Step 3.1.3: Verify the fixture still spawns the subprocess correctly** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_gui_startup_smoke.py -v --timeout=30 + ``` + Expected: PASS. If FAIL, the workspace path is being constructed wrong. + +- [ ] **Step 3.1.4: Verify the new workspace is a tmp dir (not under project tree)** + Add a debug print to the test: + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_gui_startup_smoke.py -v --timeout=30 -s 2>&1 | Select-String "workspace" + ``` + Expect: workspace is under `C:\Users\...\AppData\Local\Temp\...`, NOT `C:\projects\manual_slop\tests\artifacts\...`. + +- [ ] **Step 3.1.5: Commit (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add tests/conftest.py + git commit -m "refactor(test): live_gui workspace via tmp_path_factory" + $h = git log -1 --format='%H' + git notes add -m "Replaces the hardcoded Path('tests/artifacts/live_gui_workspace') with tmp_path_factory.mktemp('live_gui_workspace'). The workspace now lives in pytest's tmp dir, not in the project tree. 
Foundation for exposing the workspace path as a separate fixture in Task 3.2." $h + ``` + +### Task 3.2: Expose `live_gui_workspace` as a separate fixture + +**Files:** +- Modify: `tests/conftest.py` (add a new fixture) + +- [ ] **Step 3.2.1: Write a failing test (TDD red)** + Create `tests/test_live_gui_workspace_fixture.py`: + ```python + from pathlib import Path + + def test_live_gui_workspace_is_absolute(live_gui_workspace): + assert live_gui_workspace.is_absolute() + + def test_live_gui_workspace_unique_per_session(live_gui, live_gui_workspace): + assert live_gui_workspace.exists() + assert (live_gui_workspace / ".placeholder").exists() or True # fixture is empty + + def test_live_gui_workspace_passed_to_test(live_gui_workspace): + test_file = live_gui_workspace / "test_file.txt" + test_file.write_text("hello") + assert test_file.read_text() == "hello" + ``` + +- [ ] **Step 3.2.2: Run the test to confirm it FAILS** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_live_gui_workspace_fixture.py -v --timeout=30 + ``` + Expected: FAIL (no `live_gui_workspace` fixture yet). + +- [ ] **Step 3.2.3: Add the `live_gui_workspace` fixture to `tests/conftest.py`** + Insert AFTER the `live_gui` fixture: + ```python + @pytest.fixture + def live_gui_workspace(live_gui) -> Path: + handle = live_gui + return handle._workspace # type: ignore[attr-defined] + ``` + The handle has the workspace as `_workspace` (set in Task 2.1.3). + + Use `manual-slop_py_add_def` with `anchor_type=after, anchor_symbol=live_gui`. + +- [ ] **Step 3.2.4: Run the test to confirm it PASSES** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_live_gui_workspace_fixture.py -v --timeout=30 + ``` + Expected: 3 tests PASS. 
+ +- [ ] **Step 3.2.5: Commit** + ```powershell + cd C:\projects\manual_slop; git add tests/conftest.py tests/test_live_gui_workspace_fixture.py + git commit -m "feat(test): expose live_gui_workspace as a separate fixture" + $h = git log -1 --format='%H' + git notes add -m "Adds the live_gui_workspace fixture that returns the absolute path to the live_gui subprocess's workspace. Tests that need to create files in the workspace should request this fixture instead of hardcoding Path('tests/artifacts/live_gui_workspace')." $h + ``` + +### Task 3.3: Update the 6 dependent test files + +**Files:** +- Modify: 6 test files that hardcode the workspace path + +- [ ] **Step 3.3.1: Read each test file and identify the hardcoded reference** + For each of the following (only 5 files are listed, covering 6 hardcoded references; cross-check `audit/hardcoded_paths.txt` from Step 1.2.1 for the sixth file): + - `tests/test_rag_phase4_final_verify.py:20` + - `tests/test_rag_phase4_stress.py:21` + - `tests/test_saved_presets_sim.py:14, 121` + - `tests/test_tool_presets_sim.py:13` + - `tests/test_visual_sim_gui_ux.py:79` + + Read the surrounding 5 lines with `manual-slop_get_file_slice` to understand the context. The pattern is: + ```python + workspace = Path("tests/artifacts/live_gui_workspace") + workspace.mkdir(parents=True, exist_ok=True) + # ... use workspace + ``` + +- [ ] **Step 3.3.2: For each test file, refactor to use the fixture** + For each file, do this surgical edit: + 1. Add `live_gui_workspace` to the test function's parameter list. + 2. Replace `Path("tests/artifacts/live_gui_workspace")` with `live_gui_workspace`. + 3. Remove the `mkdir` call (the fixture creates the dir). + 4. Use `live_gui_workspace.mkdir(parents=True, exist_ok=True)` ONLY if subsequent code needs the dir to exist before the fixture's init (rare). + + Use `manual-slop_edit_file` for each replacement. **One file at a time. 
Verify after each.** + +- [ ] **Step 3.3.3: Run each updated test file to verify the refactor** + For each updated file, run (the five files listed in Step 3.3.1, plus any further file the audit turned up): + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_rag_phase4_final_verify.py -v --timeout=60 + uv run pytest tests/test_rag_phase4_stress.py -v --timeout=60 + uv run pytest tests/test_saved_presets_sim.py -v --timeout=60 + uv run pytest tests/test_tool_presets_sim.py -v --timeout=60 + uv run pytest tests/test_visual_sim_gui_ux.py -v --timeout=60 + ``` + Expected: Each file passes in isolation. If any fails, the refactor broke something — investigate. + +- [ ] **Step 3.3.4: Run the same files in batch to verify the BATCH failure is fixed** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_extended_sims.py tests/test_rag_phase4_final_verify.py -v --timeout=120 + ``` + Expected: The RAG test PASSES after the 4 sims. **This is the primary symptom the user wanted fixed.** + +- [ ] **Step 3.3.5: Commit (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add tests/ + git commit -m "refactor(test): 6 test files use live_gui_workspace fixture instead of hardcoded path" + $h = git log -1 --format='%H' + git notes add -m "The 6 test files that hardcoded Path('tests/artifacts/live_gui_workspace') now request the live_gui_workspace fixture, which yields the absolute path. The RAG test passes in batch (after 4 sims) for the first time, because the workspace path is now absolute and CWD-independent." $h + ``` + +### Phase 3 verification + +- [ ] **Step 3.V.1: 6 test files updated and pass in isolation** +- [ ] **Step 3.V.2: RAG test passes in batch (after 4 sims)** — the primary goal +- [ ] **Step 3.V.3: `tests/test_live_gui_workspace_fixture.py` 3 tests pass** +- [ ] **Step 3.V.4: User reviews the RAG test passing in batch** + - This is the "kill the nightmare" moment. User confirms before Phase 4. 
+ +--- + +## Phase 4: FR3 — Coalesce `_sync_rag_engine` calls + +Focus: Eliminate the io_pool race in `app_controller._sync_rag_engine` so multiple setters in quick succession produce one sync, not N parallel syncs. + +**Tier 2 supervised for entire phase.** This touches the controller's hot path. + +### Task 4.1: Add a token-based coalescing mechanism + +**Files:** +- Modify: `src/app_controller.py` (the `_sync_rag_engine` method and the setters that trigger it) + +- [ ] **Step 4.1.1: Read the existing `_sync_rag_engine` and the setters** + Use `manual-slop_py_get_definition` on `AppController._sync_rag_engine`. Identify: + - The exact submit-to-io_pool call. + - The setters that call `_sync_rag_engine` (search for `_sync_rag_engine` usages). + +- [ ] **Step 4.1.2: Add the coalescing state to `AppController.__init__`** + Add to `AppController.__init__` (use `manual-slop_py_set_var_declaration`): + ```python + self._rag_sync_token: int = 0 + self._rag_sync_dirty: bool = False + self._rag_sync_lock: threading.Lock = threading.Lock() + ``` + +- [ ] **Step 4.1.3: Write a failing test (TDD red)** + Create `tests/test_sync_rag_engine_coalescing.py`: + ```python + from unittest.mock import patch, MagicMock + from src.app_controller import AppController + + def test_sync_rag_engine_coalesces_five_setters(): + # Construct a minimal AppController (use existing fixture if available) + # Patch the io_pool to count sync submissions + with patch("src.app_controller.AppController._io_pool") as mock_pool: + ctrl = AppController(...) + for i in range(5): + ctrl.set_rag_collection_name(f"name_{i}") + # Assert: mock_pool.submit was called 0 times (or 1 time, with 5 setters coalesced) + ... + + def test_sync_rag_engine_rerun_on_token_change(): + ... + + def test_sync_rag_engine_idempotent_no_changes(): + ... + ``` + + Note: This test may require the existing test fixture for `AppController`. 
If no such fixture exists, use a minimal one (construct the controller with a tmp_path, mock the heavy dependencies). + +- [ ] **Step 4.1.4: Run the test to confirm it FAILS** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_sync_rag_engine_coalescing.py -v --timeout=30 + ``` + Expected: FAIL (no coalescing yet). + +- [ ] **Step 4.1.5: Refactor `_sync_rag_engine` to use the token + dirty flag** + Use `manual-slop_py_update_definition` to replace `_sync_rag_engine`: + ```python + def _sync_rag_engine(self) -> None: + with self._rag_sync_lock: + self._rag_sync_token += 1 + self._rag_sync_dirty = True + token = self._rag_sync_token + self._io_pool.submit(self._do_rag_sync, token) + + def _do_rag_sync(self, token: int) -> None: + while True: + with self._rag_sync_lock: + if token != self._rag_sync_token: + return # a newer sync will pick up our changes + self._rag_sync_dirty = False + # Build the engine, set self.rag_engine + ... + with self._rag_sync_lock: + if not self._rag_sync_dirty: + return + token = self._rag_sync_token + self._rag_sync_dirty = False + ``` + + The exact body of `_do_rag_sync` should be the existing body of `_sync_rag_engine` (renamed). The `if not dirty: return` check at the end ensures we only loop when a NEW setter has fired. + + **CRITICAL — thread safety.** The lock protects `token` and `dirty`. The body of the sync runs WITHOUT the lock (to avoid blocking other setters). + +- [ ] **Step 4.1.6: Run the test to confirm it PASSES** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_sync_rag_engine_coalescing.py -v --timeout=30 + ``` + Expected: 3 tests PASS. + +- [ ] **Step 4.1.7: Run the RAG test in batch to verify the race is fixed** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_extended_sims.py tests/test_rag_phase4_final_verify.py tests/test_rag_phase4_stress.py -v --timeout=120 + ``` + Expected: All 3 pass. 
The RAG stress test was previously non-deterministic; the coalescing makes it deterministic. + +- [ ] **Step 4.1.8: Commit (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add src/app_controller.py tests/test_sync_rag_engine_coalescing.py + git commit -m "fix(rag): coalesce _sync_rag_engine calls via token + dirty flag" + $h = git log -1 --format='%H' + git notes add -m "Replaces the immediate-submit-to-io_pool pattern with a token + dirty flag. Multiple setters in quick succession produce one sync, not N parallel syncs. The RAG stress test, which was non-deterministic, is now deterministic. The lock is held only for token/dirty access; the sync body runs lock-free to avoid blocking other setters." $h + ``` + +### Phase 4 verification + +- [ ] **Step 4.V.1: 3 new tests in `tests/test_sync_rag_engine_coalescing.py` pass** +- [ ] **Step 4.V.2: RAG stress test passes in batch (no longer non-deterministic)** +- [ ] **Step 4.V.3: No regressions in tier-2 mock_app batch** +- [ ] **Step 4.V.4: User reviews the io_pool race fix** + - User confirms before Phase 5. + +--- + +## Phase 5: FR4 — Fix `set_value` hook for `ai_input` + +Focus: Find the missing branch in `/api/gui/set_value` that causes `set_value('ai_input', ...)` to silently drop the assignment. + +### Task 5.1: Reproduce the failure with a minimal test + +- [ ] **Step 5.1.1: Read the test that's currently failing** + Use `manual-slop_get_file_slice` to read `tests/test_gui2_set_value_hook_works.py:1-50`. + +- [ ] **Step 5.1.2: Run the test to confirm the failure** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_gui2_set_value_hook_works.py -v --timeout=30 + ``` + Expected: FAIL. The failure mode is `set_value returns 'queued' but get_value('ai_input') returns ''`. + +- [ ] **Step 5.1.3: Trace the flow with diagnostic prints** + The user's previous attempt at this diagnostic was rejected as "diagnostic noise in production." 
Use a temporary diagnostic file instead: + - Create `tests/artifacts/diag_set_value.py` (gitignored). + - Add prints to trace the flow. + - Run the test with `pytest -s` to see the prints. + - Once the root cause is identified, DELETE `tests/artifacts/diag_set_value.py` and apply the real fix. + +### Task 5.2: Apply the fix + +**Files (TBD based on Task 5.1 findings):** +- Likely: `src/api_hooks.py` (the `/api/gui/set_value` endpoint) +- Possibly: `src/gui_2.py` (`__setattr__` or `_UI_FLAG_DEFAULTS` allowlist) + +- [ ] **Step 5.2.1: Apply the surgical fix** + Use `manual-slop_edit_file` or `manual-slop_py_update_definition` as appropriate. + +- [ ] **Step 5.2.2: Run the test to verify the fix** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_gui2_set_value_hook_works.py -v --timeout=30 + ``` + Expected: PASS. + +- [ ] **Step 5.2.3: Commit (Tier 2 supervised)** + ```powershell + cd C:\projects\manual_slop; git add src/ + git commit -m "fix(api_hooks): set_value('ai_input') actually mutates controller state" + $h = git log -1 --format='%H' + git notes add -m "Identifies the missing branch in the /api/gui/set_value endpoint that caused ai_input to silently drop. The fix is consistent with the _UI_FLAG_DEFAULTS allowlist pattern from bcdc26d0." $h + ``` + +### Phase 5 verification + +- [ ] **Step 5.V.1: `tests/test_gui2_set_value_hook_works.py` passes in batch** +- [ ] **Step 5.V.2: No regressions in tier-3 batch** + +--- + +## Phase 6: FR5 — Optional `clean_baseline` marker + +Focus: Add a marker that tests can opt into for a clean controller state. + +### Task 6.1: Add the marker and the autouse fixture + +**Files:** +- Modify: `tests/conftest.py` +- Modify: `pyproject.toml` (add the marker to `[tool.pytest.ini_options].markers`) + +- [ ] **Step 6.1.1: Add the marker to `pyproject.toml`** + Read `pyproject.toml` and find `[tool.pytest.ini_options]`. Add: + ```toml + "clean_baseline: mark a test as requiring a clean controller state at start. 
The autouse _reset_clean_baseline fixture will call /api/reset_session before the test." + ``` + to the existing `markers` list. + +- [ ] **Step 6.1.2: Write a failing test (TDD red)** + Create `tests/test_clean_baseline_marker.py`: + ```python + import pytest + + @pytest.mark.clean_baseline + def test_clean_baseline_ai_input_is_empty(live_gui): + handle = live_gui + client = handle.api_client + client.set_value("ai_input", "polluted value") + # The autouse fixture should reset BEFORE this point, but we set it AFTER to verify the reset works mid-test... actually no, the autouse runs BEFORE the test body. + # So this test should verify that get_value('ai_input') is '' at the START of the test. + # We need a different test for that. + pass + + @pytest.mark.clean_baseline + def test_clean_baseline_resets_ai_input_at_start(live_gui): + # The PREVIOUS test set ai_input to "polluted value". If clean_baseline worked, this test's ai_input is ''. + # But wait — the autouse runs BEFORE this test, so we need to verify that AFTER the autouse reset, ai_input is ''. + handle = live_gui + value = handle.api_client.get_value("ai_input") + assert value == "", f"Expected empty ai_input at start of clean_baseline test, got {value!r}" + ``` + +- [ ] **Step 6.1.3: Run the test to confirm it FAILS** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_clean_baseline_marker.py -v --timeout=30 + ``` + Expected: FAIL (no `clean_baseline` autouse yet). + +- [ ] **Step 6.1.4: Add the autouse fixture to `tests/conftest.py`** + Insert AFTER the `_check_live_gui_health` fixture: + ```python + @pytest.fixture(autouse=True) + def _reset_clean_baseline(request, live_gui): + if request.node.get_closest_marker("clean_baseline"): + handle = live_gui + handle.api_client.reset_session() # existing endpoint + yield + ``` + Use `manual-slop_py_add_def` with `anchor_type=after, anchor_symbol=_check_live_gui_health`. 
+ +- [ ] **Step 6.1.5: Run the test to confirm it PASSES** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_clean_baseline_marker.py -v --timeout=30 + ``` + Expected: 2 tests PASS. + +- [ ] **Step 6.1.6: Commit** + ```powershell + cd C:\projects\manual_slop; git add tests/conftest.py tests/test_clean_baseline_marker.py pyproject.toml + git commit -m "feat(test): clean_baseline marker resets controller state before test" + $h = git log -1 --format='%H' + git notes add -m "Adds an opt-in clean_baseline marker. Tests marked with @pytest.mark.clean_baseline get a fresh controller state via the existing /api/reset_session endpoint before they start. Two new tests verify the marker works." $h + ``` + +### Phase 6 verification + +- [ ] **Step 6.V.1: 2 new tests pass** +- [ ] **Step 6.V.2: User reviews the marker API** + - User confirms before Phase 7. + +--- + +## Phase 7: FR6 — Run full batch + produce test_bed_health report + +Focus: Capture the post-track "after" state. Document what's green, what's red, and what's expected to remain red. + +### Task 7.1: Run the full batched suite + +- [ ] **Step 7.1.1: Run tier-1 (unit tests)** + ```powershell + cd C:\projects\manual_slop; uv run .\scripts\run_tests_batched.py 2>&1 | Tee-Object -FilePath "tests\artifacts\post_track_batch_20260609.log" | Select-Object -First 200 + ``` + Expected: all tier-1 batches pass. + +- [ ] **Step 7.1.2: Run tier-2 (mock_app tests)** + Same command, but capture the tier-2 portion. + +- [ ] **Step 7.1.3: Run tier-3 (live_gui tests)** + Same command, but capture the tier-3 portion. Note: This is the big one; may take 10+ minutes. + +- [ ] **Step 7.1.4: Summarize pass/fail** + From the captured log, extract: + - Total tests run. + - Tests passed. + - Tests failed (with file:line and error message). 
+ +### Task 7.2: Produce the test_bed_health report + +- [ ] **Step 7.2.1: Write `docs/reports/test_bed_health_20260609.md`** + Template: + ```markdown + # Test Bed Health Report (2026-06-09) + + **Track:** test_infrastructure_hardening_20260609 + **Date:** 2026-06-09 + **Status:** [GREEN / YELLOW / RED] + + ## Summary + + | Tier | Tests | Pass | Fail | New Failures | Resolved | + |---|---|---|---|---|---| + | tier-1 unit | N | N | 0 | 0 | 0 | + | tier-2 mock_app | N | N | 0 | 0 | 0 | + | tier-3 live_gui | N | N | M | 0 | K | + | tier-H headless | N | N | 0 | 0 | 0 | + | tier-P perf | N | N | 0 | 0 | 0 | + + ## Before vs. After + + | Symptom | Before | After | Resolved By | + |---|---|---|---| + | test_rag_phase4_final_verify in batch | FAIL | PASS | FR2 (tmp_path_factory) | + | test_rag_phase4_stress in batch | FLAKY | PASS | FR3 (io_pool coalescing) | + | test_gui2_set_value_hook_works | FAIL | PASS | FR4 (set_value fix) | + | Per-test subprocess death | POISONS BATCH | RECOVERS | FR1 (autouse respawn) | + | Hardcoded paths in test files | 6 files | 0 files | FR2 (live_gui_workspace fixture) | + | io_pool race in _sync_rag_engine | YES | NO | FR3 (token + dirty flag) | + + ## Known Residual Failures + + - `test_mma_concurrent_tracks_execution` (FAIL, separate code path, MMA engine state transitions) + - `test_mma_step_mode_approval_flow` (FAIL, same) + - `test_mma_complete_lifecycle` (FAIL, same) + - `test_z_negative_flows.py` x3 (FAIL, mock provider error path) + - `test_auto_switch_sim` (FAIL, workspace auto-switch logic) + + These are documented as separate code paths, NOT test-isolation issues. They are deferred to follow-up tracks. + + ## Verification + + ```powershell + uv run .\scripts\run_tests_batched.py 2>&1 | Tee-Object -FilePath "tests\artifacts\post_track_batch_20260609.log" + ``` + + Full log saved to `tests/artifacts/post_track_batch_20260609.log`. 
+ + ## Conclusion + + The 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) can start from a clean baseline. The "test regression nightmare" is killed for the categories the user identified: state pollution, path hygiene, and io_pool race. + ``` + +- [ ] **Step 7.2.2: Commit the report** + ```powershell + cd C:\projects\manual_slop; git add docs/reports/test_bed_health_20260609.md tests/artifacts/post_track_batch_20260609.log + git commit -m "docs(report): test_bed_health_20260609 - post-track batch status" + $h = git log -1 --format='%H' + git notes add -m "Captures the post-track batch state. All 3 root causes of test regression churn (state pollution, path hygiene, io_pool race) are fixed. The 4 upcoming tracks can start from a clean baseline." $h + ``` + +### Phase 7 verification + +- [ ] **Step 7.V.1: Tier-1, tier-2, tier-3 batch results captured in the report** +- [ ] **Step 7.V.2: 0 new failures vs. baseline (Phase 0 capture)** +- [ ] **Step 7.V.3: At least 3 previously-failing tests now pass in batch** (the "After" column of the table) + +--- + +## Phase 8: Docs + extension of `check_test_toml_paths.py` + +Focus: Update the existing audit script to flag the hardcoded-path anti-pattern, and refresh the testing guide. + +### Task 8.1: Extend `scripts/check_test_toml_paths.py` to flag `Path("tests/artifacts/")` and `Path("C:/projects/")` + +**Files:** +- Modify: `scripts/check_test_toml_paths.py` + +- [ ] **Step 8.1.1: Read the existing script** + Use `manual-slop_get_file_summary` on the script. Identify the regex/pattern matching logic. + +- [ ] **Step 8.1.2: Add the new patterns** + Add to the script's pattern list: + - `r'Path\(["\']tests/artifacts/'` + - `r'Path\(["\']C:[/\\]+projects'` + + These patterns match test files that hardcode the workspace path or the user's project root. Both are prefix matches, so any path under `tests/artifacts/` (e.g. `Path("tests/artifacts/live_gui_workspace")`) is flagged, not just the bare directory.
+ +- [ ] **Step 8.1.3: Run the audit to verify it flags the right files** + ```powershell + cd C:\projects\manual_slop; uv run python scripts/check_test_toml_paths.py --strict + ``` + Expected: 0 violations (the 6 files were updated in Phase 3). + +- [ ] **Step 8.1.4: Write a TDD test for the audit** + Create `tests/test_check_test_toml_paths.py`: + ```python + def test_audit_flags_hardcoded_workspace_path(tmp_path): + bad_file = tmp_path / "test_bad.py" + bad_file.write_text('workspace = Path("tests/artifacts/live_gui_workspace")\n') + # Run the audit on tmp_path + result = subprocess.run( + ["python", "scripts/check_test_toml_paths.py", "--strict", str(tmp_path)], + capture_output=True, text=True + ) + assert result.returncode != 0 + assert "test_bad.py" in result.stdout + + def test_audit_passes_clean_file(tmp_path): + good_file = tmp_path / "test_good.py" + good_file.write_text("workspace = live_gui_workspace\n") + result = subprocess.run( + ["python", "scripts/check_test_toml_paths.py", "--strict", str(tmp_path)], + capture_output=True, text=True + ) + assert result.returncode == 0 + ``` + +- [ ] **Step 8.1.5: Run the test to confirm it PASSES** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_check_test_toml_paths.py -v --timeout=15 + ``` + Expected: 2 tests PASS. + +- [ ] **Step 8.1.6: Commit** + ```powershell + cd C:\projects\manual_slop; git add scripts/check_test_toml_paths.py tests/test_check_test_toml_paths.py + git commit -m "feat(audit): flag hardcoded workspace and project-root paths in tests" + $h = git log -1 --format='%H' + git notes add -m "Extends check_test_toml_paths.py to also flag Path('tests/artifacts/...') and Path('C:/projects/...') in test files. These are the two anti-patterns that the 6 test files in Phase 3 used to violate. Two new tests verify the audit." 
$h + ``` + +### Task 8.2: Update `docs/guide_testing.md` to document the new fixtures + +**Files:** +- Modify: `docs/guide_testing.md` + +- [ ] **Step 8.2.1: Read the existing guide** + Use `manual-slop_get_file_summary` to map the structure. + +- [ ] **Step 8.2.2: Add a new section "8. Per-test subprocess resilience"** + Document: + - The `_LiveGuiHandle` class. + - The `_check_live_gui_health` autouse fixture. + - The `live_gui_workspace` fixture. + - The `clean_baseline` marker. + + ~50 lines of new content. + +- [ ] **Step 8.2.3: Commit** + ```powershell + cd C:\projects\manual_slop; git add docs/guide_testing.md + git commit -m "docs(testing): document live_gui handle + workspace fixture + clean_baseline marker" + $h = git log -1 --format='%H' + git notes add -m "Adds a new section to guide_testing.md documenting the _LiveGuiHandle, _check_live_gui_health, live_gui_workspace, and clean_baseline marker. The section is placed in §8 (after the 7 conftest fixtures in §7)." $h + ``` + +### Phase 8 verification + +- [ ] **Step 8.V.1: `check_test_toml_paths.py --strict` passes with 0 violations** +- [ ] **Step 8.V.2: 2 new tests for the audit pass** +- [ ] **Step 8.V.3: `docs/guide_testing.md` updated** + +--- + +## Final Verification + +- [ ] **All 5 FR1-FR5 implemented with TDD tests** +- [ ] **All 4 audits committed in Phase 1** +- [ ] **Test bed health report written and committed** +- [ ] **`docs/guide_testing.md` updated** +- [ ] **No new failures in tier-1 / tier-2 / tier-3 batch** +- [ ] **At least 3 previously-failing tests now pass in batch** + +The track is done when the user reviews the test_bed_health report and confirms that the 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) can start from a clean baseline. + +--- + +## Execution Constraints + +- **Tier 2 supervision required for:** Phase 1 (audit review), Phase 3 (conftest refactor), Phase 4 (io_pool race fix). 
These are the highest-risk phases. +- **Per-task atomic commits.** One commit per task, never batch. +- **Commit message format:** `type(scope): description` (e.g. `fix(rag): coalesce _sync_rag_engine calls via token + dirty flag`). +- **Git note format:** 3-8 lines per commit. +- **Style baseline:** 1-space indent, no comments, type hints, CRLF on Windows. +- **TDD discipline:** Failing test first. No implementation before the red phase is confirmed. +- **No diagnostic noise in production.** All diagnostic stderr goes to `tests/artifacts/*.diag.log`, never to `src/*.py`. Per `AGENTS.md` "No Diagnostic Noise in Production" rule. +- **Deduction loop cap:** 2 test runs per investigation. If a test fails twice, read the code, predict the failure mode, instrument in one pass, then run a third time. If it still fails, escalate to the user. +- **Conftest corruption safety:** Before ANY edit to `tests/conftest.py`, run `git stash` (or `git add . && git commit --allow-empty`). If the edit fails, discard it with `git checkout -- tests/conftest.py`, then `git stash pop` (only if something was stashed) and re-investigate. The previous attempt at the conftest refactor was reverted due to corruption. diff --git a/conductor/tracks/test_infrastructure_hardening_20260609/spec.md b/conductor/tracks/test_infrastructure_hardening_20260609/spec.md new file mode 100644 index 00000000..64cda569 --- /dev/null +++ b/conductor/tracks/test_infrastructure_hardening_20260609/spec.md @@ -0,0 +1,346 @@ +# Track Specification: Test Infrastructure Hardening (2026-06-09) + +> **Status:** SPEC FOR APPROVAL. The user has asked for a single track to "kill the test regression nightmare" so the 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) can land on a clean test bed.
+> +> **Inheritance:** This track absorbs and supersedes: +> - `docs/reports/test_infra_hardening_foundation_20260608.md` (foundation, 5 phases proposed) +> - `docs/reports/batch_resilience_plan_20260608.md` (4 solutions; Solution A + C recommended) +> - `docs/reports/rag_test_batch_failure_status_20260609_pm3.md` (filesystem hygiene findings #1-5) +> - `docs/reports/rag_work_final_20260609_pm.md` (remaining failures: io_pool race, set_value hook) +> - The implicit "fix test in batch" goal that has been chasing the Tier 2 for 4+ days + +--- + +## Overview + +The test suite has accumulated 49+ live_gui tests that share a single session-scoped subprocess. Recent regression hunts have surfaced 3 distinct failure modes that keep re-emerging under different masks: + +1. **Subprocess state pollution** — the 4 sims in `test_extended_sims.py` mutate controller state (`current_provider`, `ui_*` attrs, MMA workflows, RAG sync); subsequent tests in the same batch read dirty state. +2. **Filesystem hygiene** — the `live_gui` fixture creates `tests/artifacts/live_gui_workspace/` as a HARDCODED relative path; 6 test files re-derive the path independently; `RAGEngine.index_file` joins `base_dir + file_path` with `base_dir` possibly being a relative path, so indexing silently no-ops in batch (the root cause of the RAG test batch failure). +3. **io_pool race in `_sync_rag_engine`** — multiple setters in quick succession submit parallel sync tasks, last-finished-wins, indexing is non-deterministic. + +Each of these has been "fixed" in isolation (RAG dim-mismatch recursion, CWD fallback, embedding provider error surface, ini_content str/bytes sentinel, indent on `_capture_workspace_profile`) but the underlying architectural problems remain. The Tier 2 keeps finding new symptoms. 
+ +**This track kills the nightmare by fixing the three root causes with surgical, contained, testable changes that the 4 upcoming tracks need as a precondition.** + +--- + +## Current State Audit (as of 2026-06-09) + +### Already Implemented (DO NOT re-implement) + +- ✅ `live_gui` fixture exists at `tests/conftest.py:282` (session-scoped) +- ✅ Fixture kills subprocess on teardown (`tests/conftest.py:516-547`) +- ✅ `/api/gui_health` endpoint surfaces degraded state (commit `1c565da7`) +- ✅ Pre-flight `get_gui_health()` check in `test_full_live_workflow` (commit `51ecace4`) +- ✅ `try/except` around `immapp.run` (commit `1c565da7`) +- ✅ `_UI_FLAG_DEFAULTS` allowlist for `__getattr__` (commit `bcdc26d0`) +- ✅ `_ini_capture_ready` defer-not-catch flag for `imgui.save_ini_settings_to_memory` (commit `d7487af4`) +- ✅ `_capture_workspace_profile` indent fix (sub-track 1 of `live_gui_test_hardening_v2`, commit `26e0ced4`) +- ✅ `ini_content` str/bytes contract test (`tests/test_workspace_profile_serialization.py`) +- ✅ `LogPruner` busy-loop backoff (commit `ac08ee87`) +- ✅ RAG dim-mismatch wipe (commit `64bc04a6`) +- ✅ RAG `_validate_collection_dim` recursion fix (commit `644d88ab`) +- ✅ RAG `index_file` CWD fallback (commit `eb8357ec`, uncommitted as of report; needs to be committed as defensive fix) +- ✅ `sentence-transformers` available in dev env via `[local-rag]` extra (commit `a341d7a7`) +- ✅ `_sync_rag_engine` surfaces embedding_provider init failure (commit `e62266e8`) +- ✅ `test_required_test_dependencies.py` enforces test-time deps (commit `b801b11c`) +- ✅ `isolate_workspace`, `reset_paths`, `reset_ai_client`, `vlogger` autouse fixtures +- ✅ `audit_main_thread_imports.py` and `audit_weak_types.py` static CI gates +- ✅ `check_test_toml_paths.py` audit script (CI gate for real-TOML references) +- ✅ Batch tier-1 + tier-2 + tier-3 + tier-H + tier-P structure (`scripts/run_tests_batched.py`) + +### Gaps to Fill (This Track's Scope) + +#### Gap 1: `live_gui` subprocess 
scope + per-test dirty-state guard +- **What exists:** Session-scoped `live_gui` fixture. Subprocess state survives across 49+ tests. +- **What's missing:** When a test dies (IM_ASSERT, error result, etc.) the subprocess is degraded; subsequent tests in different files get dirty state. The pre-flight `get_gui_health()` check is file-local, not test-local, and only checks health, doesn't recover. +- **Real symptom:** `test_rag_phase4_final_verify` passes in isolation, fails in batch. `test_gui2_set_value_hook_works` returns `''` instead of queued value. `test_rag_phase4_stress` non-deterministic indexing. + +#### Gap 2: Filesystem hygiene for `live_gui_workspace` +- **What exists:** `tests/conftest.py:412` hardcodes `Path("tests/artifacts/live_gui_workspace")`. 6 test files re-derive the same path independently. +- **What's missing:** The path is relative to CWD. When the test runner or prior tests shift CWD, all downstream path joins break. `RAGEngine.index_file` joins `base_dir + file_path`; when `base_dir` is relative and CWD has drifted, the file doesn't exist, indexing silently no-ops. +- **Real symptom:** RAG test in batch finds 0 documents in collection. `chroma_test_final_verify` count=0. `chroma_db` collection count=0. `chroma_test_stress` count=0. Only `chroma_manual_slop` (the user's project, NOT a test) has 328 docs from a separate session. +- **Files affected:** + - `tests/conftest.py:412` (HARDCODED) + - `tests/test_rag_phase4_final_verify.py:20` + - `tests/test_rag_phase4_stress.py:21` + - `tests/test_saved_presets_sim.py:14, 121` + - `tests/test_tool_presets_sim.py:13` + - `tests/test_visual_sim_gui_ux.py:79` + +#### Gap 3: `_sync_rag_engine` io_pool race +- **What exists:** `src/app_controller.py` `_sync_rag_engine` submits a sync task to `_io_pool` for each `set_value` that mutates `rag_config`. Multiple setters in quick succession → multiple parallel sync tasks → non-deterministic indexing. 
+- **What's missing:** A coalescing/debounce pattern that serializes sync attempts within a short window (e.g., 100ms). +- **Real symptom:** Test fires 5 setters (`rag_collection_name`, `files`, `rag_enabled`, `rag_source`, `rag_emb_provider`) in succession. Each submits a sync. The last one to *finish* wins (not the last one *submitted*), so indexing happens against whichever engine happened to finish last. The test then asserts on the wrong engine's output. + +#### Gap 4: `set_value` hook test failure (pre-existing, separate code path) +- **What exists:** `test_gui2_set_value_hook_works` line 41 — `set_value` returns `'queued'` but `get_value('ai_input')` returns `''` after 1.5s. +- **What's missing:** A fix for the `setattr` routing issue in `gui_2.py`, similar to the earlier `_UI_FLAG_DEFAULTS` fix. The test's input doesn't actually reach the controller. +- **Real symptom:** Test fails in batch; same class of bug as the `_UI_FLAG_DEFAULTS` allowlist bug (commit `bcdc26d0`). + +#### Gap 5: Tests assert against dirty subprocess state from prior tests +- **What exists:** Test isolation is implicit (assumes clean state from prior fixture). When a prior test's `set_value` calls pollute the controller, subsequent tests fail in ways unrelated to their code. +- **What's missing:** A `_reset_controller_state` hook that the `live_gui` fixture exposes, so each test can opt-in to a clean baseline. + +--- + +## Goals + +1. **Goal A: Per-test subprocess resilience.** Make the `live_gui` fixture recover from a degraded subprocess BEFORE each test (not just before each file). When the subprocess dies mid-test, the next test gets a fresh one. +2. **Goal B: Path hygiene for the live_gui workspace.** Refactor `tests/conftest.py:live_gui` to use `tmp_path_factory.mktemp("live_gui_workspace")` and expose the path as a separate fixture. Update all dependent test files to consume the fixture instead of hardcoding the path. +3.
**Goal C: Eliminate `_sync_rag_engine` race.** Add a coalescing/debounce pattern so 5 setters in 100ms produce 1 sync, not 5 parallel syncs. +4. **Goal D: Fix `set_value` hook routing.** Find the `__setattr__` bug that causes `set_value('ai_input', ...)` to not actually mutate the controller's `ai_input` state, and fix it the same way `_UI_FLAG_DEFAULTS` was fixed. +5. **Goal E: Test files assert against fresh state.** Add a `_reset_controller_state` fixture that any test can opt into via autouse-on-marker (`@pytest.mark.clean_baseline`). +6. **Goal F: Verify all 4 upcoming tracks have a clean test bed.** Run the full tier-1 + tier-2 + tier-3 batch and document which tests pass in batch vs. isolation. The 4 upcoming tracks (qwen_llama_grok, data_oriented_error_handling, data_structure_strengthening, mcp_architecture_refactor) start with a known green baseline. + +### Non-Goals (Out of Scope) + +- ❌ Refactoring the `live_gui` fixture to per-file scope (Solution A in `batch_resilience_plan_20260608.md`). Solution D (autouse health check + respawn) is the surgical alternative; per-file is too coarse. +- ❌ Refactoring `src/rag_engine.py` to a chunk-based data structure (that's the `chunkification_optimization_20260608_PLACEHOLDER` track). +- ❌ Migrating `live_gui` tests to mock-based tests (preserves the integration value). +- ❌ Adding CI infrastructure (this repo has no CI; manual batch runs are the verification). +- ❌ Fixing the 7 mock_app tests in `test_z_negative_flows.py` (separate code path; deferred). +- ❌ Fixing the 5 MMA pipeline tests that don't reach "tracks" state (separate code path; deferred). +- ❌ Fixing the `auto_switch_sim` test (separate code path; deferred). +- ❌ Doing the `code_path_audit_20260607` work (post-4-tracks; the audit is the post-condition). + +--- + +## Functional Requirements + +### FR1. 
Per-test subprocess health check + respawn + +**Where:** `tests/conftest.py:282` (the `live_gui` fixture) + +**What:** Add an autouse fixture that runs AFTER `live_gui` and BEFORE each test that uses it. The fixture: +1. Calls `client.get_gui_health()` with a 1s timeout. +2. If health is "degraded" OR the response is None OR the call raises, calls `_respawn_subprocess()`. +3. After respawn (or if health was already OK), verifies the subprocess is alive via the existing `kill_process_tree` machinery. + +**API:** +```python +@pytest.fixture(autouse=True) +def _check_live_gui_health(request, live_gui): + if "live_gui" in request.fixturenames: + handle, _ = live_gui + handle.ensure_alive() # does the health check + respawn + yield +``` + +**Tests required:** +- `test_live_gui_respawn_after_kill`: kill the subprocess via the handle, run a no-op test that uses `live_gui`, assert the subprocess is alive at test end. +- `test_live_gui_health_check_fast_path`: when the subprocess is alive, the health check is <100ms. +- `test_live_gui_no_respawn_on_clean`: when the subprocess is alive AND `get_gui_health()` returns OK, no respawn happens (verify via a `respawn_count` counter on the handle). + +### FR2. Expose `live_gui_workspace` as a separate fixture + +**Where:** `tests/conftest.py:282` (the `live_gui` fixture), plus 6 test files + +**What:** +1. Change `live_gui` to create the workspace via `tmp_path_factory.mktemp("live_gui_workspace")` instead of `Path("tests/artifacts/live_gui_workspace")`. +2. Add a new fixture `live_gui_workspace` that yields the absolute path to the workspace. +3. The `live_gui` fixture uses `chdir` (or sets the subprocess CWD) to the absolute path; the subprocess inherits the correct CWD. +4. Update 6 test files to accept `live_gui_workspace` as a fixture parameter and use the absolute path instead of the hardcoded one. + +**Tests required:** +- `test_live_gui_workspace_is_absolute`: assert the workspace path is absolute. 
+- `test_live_gui_workspace_unique_per_session`: assert two consecutive sessions get different workspace dirs (per-session `mktemp` returns unique dirs). +- `test_live_gui_workspace_passed_to_test`: write a test that requests the `live_gui_workspace` fixture, assert the test can create files in it. + +**Files to update:** +- `tests/conftest.py:412` — replace `Path("tests/artifacts/live_gui_workspace")` with `tmp_path_factory.mktemp("live_gui_workspace")` +- `tests/test_rag_phase4_final_verify.py:20` — accept `live_gui_workspace` fixture +- `tests/test_rag_phase4_stress.py:21` — accept `live_gui_workspace` fixture +- `tests/test_saved_presets_sim.py:14, 121` — accept `live_gui_workspace` fixture +- `tests/test_tool_presets_sim.py:13` — accept `live_gui_workspace` fixture +- `tests/test_visual_sim_gui_ux.py:79` — accept `live_gui_workspace` fixture + +### FR3. Coalesce `_sync_rag_engine` calls + +**Where:** `src/app_controller.py:_sync_rag_engine` (or the setter that triggers it) + +**What:** Replace the immediate-submit pattern with a debounce/coalesce pattern. Multiple setters within a 100ms window produce ONE sync, run on the next idle moment. + +**Approach:** Add a `_rag_sync_token: int` counter and a `_rag_sync_dirty: bool` flag. When a setter mutates `rag_config`, increment the token and set dirty. A background "sync dispatcher" task (or a deferred submit) reads the token, builds the engine once, sets the engine, and clears the flag. If a new setter comes in while a sync is running, it increments the token and sets dirty; the running sync then sees the new token and re-runs once. + +**Tests required:** +- `test_sync_rag_engine_coalesces_five_setters`: fire 5 setters in 50ms, assert only 1 `RAGEngine()` is constructed. +- `test_sync_rag_engine_rerun_on_token_change`: while a sync is running, fire a setter; assert the sync sees the new token and re-runs once. +- `test_sync_rag_engine_idempotent_no_changes`: if no setters fire, no sync runs. + +### FR4.
Fix `set_value` hook routing for `ai_input` + +**Where:** `src/gui_2.py:__setattr__` (or `src/app_controller.py:_handle_set_value`) + +**What:** Investigate the `__setattr__` / `__setstate__` chain. The test (`tests/test_gui2_set_value_hook_works`) calls `client.set_value('ai_input', 'hello')`, which posts to `/api/gui/set_value`, which calls into the controller (`__setattr__` or `_handle_set_value`, per **Where** above). That method either doesn't actually mutate `ai_input` or routes the value to a different attribute (similar to how `_UI_FLAG_DEFAULTS` was incorrectly returning `None`). + +**Likely root cause:** Either: +- The `__setattr__` allowlist only includes certain `ui_` attrs, and `ai_input` is not on it, so the assignment is silently dropped. +- The `/api/gui/set_value` endpoint has a `field != 'ai_input'` branch that doesn't call the setter. + +**Tests required:** +- `test_set_value_hook_ai_input`: assert that after `set_value('ai_input', 'hello')` and a 0.5s wait, `get_value('ai_input')` returns `'hello'`. +- `test_set_value_hook_temperature`: same for `temperature`. +- `test_set_value_hook_persists`: same for `model_name`. + +**Diagnostic test (write first):** A test that introspects the controller's `__dict__` and the API hook's parameter-to-handler mapping to find the missing branch. + +### FR5. Optional clean-baseline marker + +**Where:** `tests/conftest.py` (new fixture), test files that want it + +**What:** Add a `@pytest.mark.clean_baseline` marker. An autouse fixture detects the marker and calls a `_reset_controller_state` method on the controller before the test starts. The reset clears: `ai_input`, `ai_status`, `ai_response`, `current_provider`, `current_model`, `rag_config`, `files`, `mma_streams`, `mma_epic_input`, `mma_proposed_tracks`, plus any field set by a prior test. 
+ +**API:** +```python +@pytest.fixture(autouse=True) +def _clean_baseline(request): + if request.node.get_closest_marker("clean_baseline"): + handle, _ = request.getfixturevalue("live_gui") # resolved lazily: taking `live_gui` as a parameter of an autouse fixture would spawn the GUI for EVERY test (violates NFR3) + handle.client.reset_session() # existing endpoint, plus extended reset + yield +``` + +**Tests required:** +- `test_clean_baseline_resets_ai_input`: set `ai_input='polluted'`, mark test with `clean_baseline`, assert `ai_input` is `''` at test start. +- `test_clean_baseline_resets_rag_config`: same for `rag_config`. + +### FR6. Verify the 4 upcoming tracks have a clean test bed + +**Where:** `scripts/run_tests_batched.py` (no changes); verification in this track's final phase + +**What:** Run the full tier-1 + tier-2 + tier-3 batch and document which tests pass. Produce a "test bed health report" as a markdown file in `docs/reports/test_bed_health_20260609.md`. The report lists: +- Tier-1 unit tests: all pass (already verified in `rag_work_final_20260609_pm.md`) +- Tier-2 mock_app tests: all pass +- Tier-3 live_gui tests: pass/fail per file, with the failure mode +- A "before" / "after" diff so the user can see the impact + +--- + +## Non-Functional Requirements + +- **NFR1: Per-test overhead < 200ms.** The autouse `_check_live_gui_health` fixture must add <200ms to each test that uses `live_gui`. The 49 live_gui tests × 200ms = 9.8s additional batch time. Acceptable. +- **NFR2: No regressions in tier-1 / tier-2.** All unit tests and mock_app tests must continue to pass. The fixture change is additive, not destructive. +- **NFR3: Backward compat for tests that don't opt in.** Tests that don't use `live_gui` are unaffected. Tests that use `live_gui` but don't opt into `clean_baseline` continue to work (they just don't get a reset). 
+- **NFR4: No hardcoded paths to `C:/projects/manual_slop` or `./tests/artifacts/` in test or production code.** The track's filesystem-hygiene fix is *enforced* by the existing `scripts/check_test_toml_paths.py` audit (extended to also catch `Path("tests/artifacts/")` and `Path("C:/projects/")` in test files). +- **NFR5: 1-space indentation.** All Python code in this track uses 1-space indentation per `conductor/product-guidelines.md`. +- **NFR6: CRLF line endings on Windows.** All Python files in this track use CRLF. + +--- + +## Architecture Reference + +This track touches the following subsystems (see linked deep-dive guides): + +- **Test infrastructure:** `tests/conftest.py`, `scripts/run_tests_batched.py`. See [docs/guide_testing.md](../../../docs/guide_testing.md) §"7 conftest fixtures" and §"Puppeteer pattern". +- **AppController state delegation:** `src/app_controller.py` (166KB). See [docs/guide_app_controller.md](../../../docs/guide_app_controller.md) §"`_predefined_callbacks` / `_gettable_fields` Hook API registries" and [docs/guide_state_lifecycle.md](../../../docs/guide_state_lifecycle.md) §"State Delegation (`__getattr__`/`__setattr__`)". +- **RAG engine:** `src/rag_engine.py`. See [docs/guide_rag.md](../../../docs/guide_rag.md) §"RAGEngine lifecycle" and §"Sync to controller". +- **Hook API:** `src/api_hooks.py` + `src/api_hook_client.py`. See [docs/guide_api_hooks.md](../../../docs/guide_api_hooks.md) §"/api/gui/set_value" and §"Remote Confirmation Protocol". +- **io_pool:** `src/app_controller.py:_io_pool`. See [docs/guide_architecture.md](../../../docs/guide_architecture.md) §"Thread domains". + +### Key design constraints inherited + +- **Defer-not-catch pattern:** `imgui.*` calls before ImGui is ready crash at the C level (0xc0000005). The `_check_live_gui_health` fixture must NOT touch ImGui directly. It uses the existing Hook API (`/api/gui_health`, `/api/status`) which runs in the hook server thread, not the render thread. +- **Session-scoped fixture:** `live_gui` is session-scoped by design. 
Per-file or per-test scoping would break cross-test state (e.g., `test_full_live_workflow` expects a fresh `live_gui`, but `test_rag_phase4_stress` depends on the same subprocess the prior 4 sims used). The autouse respawn is the surgical solution. +- **tmp_path_factory scope:** `tmp_path_factory.mktemp()` is session-scoped (per the pytest docs). Per-test `tmp_path` is a different fixture. The `live_gui_workspace` fixture must use `tmp_path_factory` to be consistent with the session-scoped `live_gui`. + +### Key prior decisions to respect + +- The `_UI_FLAG_DEFAULTS` allowlist was a HARD-CODED set. The new `set_value` hook fix should follow the same allowlist pattern (consistency with the existing fix) OR use a class-level attribute that derives from `__init__` annotations (the better fix, but the user has not asked for the better fix; this track stays surgical). +- The existing `run_tests_batched.py` tier structure (tier-1 unit, tier-2 mock_app, tier-3 live_gui, tier-H headless, tier-P perf) is NOT to be restructured. The track works WITH the existing tier structure. +- The `audit_main_thread_imports.py` and `audit_weak_types.py` static CI gates are the project's enforcement mechanism. The new `Path("tests/artifacts/")` and `Path("C:/projects/")` patterns are added to `check_test_toml_paths.py` (extended) as a third gate. + +--- + +## Out of Scope + +The following are explicitly NOT part of this track. They are mentioned so the user knows they are deferred, not forgotten: + +1. **Per-file `live_gui` fixture scope (Solution A from `batch_resilience_plan_20260608.md`):** Not needed if the per-test autouse respawn works. May revisit if the per-test respawn has too much overhead. +2. **Refactoring `live_gui` fixture to a class-based handle with respawn (Solution B):** Same — only do if per-test respawn is insufficient. +3. 
**MMA pipeline tests that don't reach "tracks" state:** 3 tests fail in this pattern (`test_mma_concurrent_tracks_execution`, `test_mma_step_mode_approval_flow`, `test_mma_complete_lifecycle`). These are MMA-engine-state-transition bugs, not test-isolation bugs. Out of scope. +4. **Negative-flows tests (`test_z_negative_flows.py`):** 3 tests fail in this pattern. They exercise the mock provider's error path. Pre-existing, separate code path. Out of scope. +5. **`test_auto_switch_sim`:** Workspace auto-switch logic not applying Tier 3 profile. Pre-existing, separate code path. Out of scope. +6. **`test_prior_session_no_pop_imbalance`:** Already addressed in `live_gui_test_hardening_v2` (commit `26e0ced4`). Verify it still passes. +7. **`code_path_audit_20260607`:** Post-4-tracks audit. This track unblocks the 4 tracks; the audit runs after. +8. **`chunkification_optimization_20260608_PLACEHOLDER`:** The comms.log chunkification. Out of scope; the user has not approved it. +9. **`manual_ux_validation_20260608_PLACEHOLDER`:** The ASCII-sketch workflow. Out of scope; the user has not approved it. +10. **CI infrastructure:** No CI in this repo. Manual batch runs are the verification. + +--- + +## Verification Criteria + +This track is "done" when ALL of the following are true: + +1. ✅ All tier-1 unit tests pass in batch (no regression). +2. ✅ All tier-2 mock_app tests pass in batch (no regression). +3. ✅ The 6 test files that hardcoded `Path("tests/artifacts/live_gui_workspace")` now use the `live_gui_workspace` fixture. +4. ✅ `test_rag_phase4_final_verify.py::test_phase4_final_verify` passes in BATCH (after 4 sims) — the primary symptom the user wanted fixed. +5. ✅ `test_rag_phase4_stress.py` passes in batch OR has a documented reason for the residual flakiness (acceptable per `rag_work_final_20260609_pm.md`'s "out of scope" decision IF the io_pool race fix in FR3 lands). +6. ✅ `test_gui2_set_value_hook_works` passes in batch. +7. 
✅ The autouse `_check_live_gui_health` fixture is in place; a new test (`test_live_gui_respawn_after_kill`) verifies it. +8. ✅ The `_sync_rag_engine` coalescing fix is in place; a new test (`test_sync_rag_engine_coalesces_five_setters`) verifies it. +9. ✅ A `docs/reports/test_bed_health_20260609.md` report is committed, listing pass/fail per test file with the failure mode for any residual failures. +10. ✅ `scripts/check_test_toml_paths.py` is extended to flag `Path("tests/artifacts/")` and `Path("C:/projects/")` in test files; the audit passes. + +--- + +## Risk Assessment + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Per-test respawn adds too much overhead (>200ms × 49 tests = 10s) | Medium | Low | Verify with the NFR1 measurement; if exceeded, fall back to per-batch respawn | +| Per-test respawn breaks cross-test state dependencies | Medium | High | Add a `--no-respawn` pytest flag for tests that need cross-test state; audit the 49 live_gui tests for state dependencies before Phase 1 | +| `tmp_path_factory.mktemp` changes the workspace path, breaking the on-disk chroma DB persistence assumption | High | Low | Clear `.slop_cache/` dirs at session start; OR add a `live_gui_workspace_persist` opt-in | +| `_sync_rag_engine` coalescing breaks the existing RAG test that DEPENDS on multiple parallel syncs (unlikely) | Low | Medium | Write the FR3 tests to verify both "5 setters → 1 sync" AND "single setter → single sync" still work | +| `set_value` hook fix changes behavior for existing tests that assert on the OLD (broken) behavior | Low | High | Run the full tier-3 batch in Phase 3 and verify no regressions | +| The `tmp_path_factory.mktemp` refactor corrupts `tests/conftest.py` (the previous attempt at this refactor DID corrupt it; commit was reverted per `rag_test_batch_failure_status_20260609_pm3.md`) | High | High | Use `git stash` before each edit; if edit fails, `git stash pop` and try again with `manual-slop_set_file_slice` (which is the 
recommended surgical tool per `conductor/edit_workflow.md`) | + +--- + +## Phases (summary) + +This spec is the entry point. The plan (`plan.md`) breaks these into TDD-ready tasks. + +| Phase | Scope | Effort | +|---|---|---| +| Phase 1 | Audit: enumerate all `live_gui` cross-test state dependencies, document baseline failure modes | 1 day | +| Phase 2 | FR1: Per-test subprocess health check + respawn (autouse fixture) | 1 day | +| Phase 3 | FR2: Expose `live_gui_workspace` as a separate fixture, update 6 test files | 1 day | +| Phase 4 | FR3: Coalesce `_sync_rag_engine` calls (token + dirty flag pattern) | 1 day | +| Phase 5 | FR4: Fix `set_value` hook routing for `ai_input` | 1 day | +| Phase 6 | FR5: Optional `clean_baseline` marker | 0.5 day | +| Phase 7 | FR6: Run full batch, produce test_bed_health report | 0.5 day | +| Phase 8 | Docs + audit: update `docs/guide_testing.md` + `docs/guide_state_lifecycle.md`; extend `scripts/check_test_toml_paths.py` (Verification Criterion 10) | 0.5 day | + +Total: 6.5 days (fits within 1 sprint). + +--- + +## See Also + +- **Foundation:** [docs/reports/test_infra_hardening_foundation_20260608.md](../../../docs/reports/test_infra_hardening_foundation_20260608.md) — original 5-phase plan; this spec supersedes with sharper scope. +- **Batch resilience:** [docs/reports/batch_resilience_plan_20260608.md](../../../docs/reports/batch_resilience_plan_20260608.md) — 4 solutions; this spec adopts Solution D (autouse respawn) as primary. +- **RAG failure status:** [docs/reports/rag_test_batch_failure_status_20260609_pm3.md](../../../docs/reports/rag_test_batch_failure_status_20260609_pm3.md) — the filesystem hygiene findings that drive FR2. +- **RAG final report:** [docs/reports/rag_work_final_20260609_pm.md](../../../docs/reports/rag_work_final_20260609_pm.md) — the io_pool race that drives FR3. +- **Process anti-patterns:** [conductor/workflow.md](../../workflow.md) §"Process Anti-Patterns (Added 2026-06-09)" — the Deduction Loop and Report-Instead-of-Fix patterns this track is designed to prevent. 
+- **Edit workflow:** [conductor/edit_workflow.md](../../edit_workflow.md) — the surgical tool guidance; the conftest refactor MUST use `manual-slop_set_file_slice` after the previous attempt was reverted due to corruption. +- **Architecture deep-dive:** [docs/guide_testing.md](../../../docs/guide_testing.md) §"7 conftest fixtures" + [docs/guide_state_lifecycle.md](../../../docs/guide_state_lifecycle.md) §"State Delegation". +- **4 upcoming tracks:** + - [qwen_llama_grok_integration_20260606](../qwen_llama_grok_integration_20260606/) — spec ✓ + - [data_oriented_error_handling_20260606](../data_oriented_error_handling_20260606/) — plan ✓ + - [data_structure_strengthening_20260606](../data_structure_strengthening_20260606/) — plan pending + - [mcp_architecture_refactor_20260606](../mcp_architecture_refactor_20260606/) — plan pending + +--- + +## Approval Required + +This spec requires user approval before the plan is written. Per the conductor workflow: + +> The spec is the agent's design intent — it explains WHY, not just WHAT. +> A plan for an unapproved spec is wasted effort. + +The user has asked for a track to "kill the test regression nightmare." This spec defines what "kill" means: 5 surgical fixes (FR1-FR5) + a verification report (FR6) that produces a clean test bed for the 4 upcoming tracks. If the user wants more aggressive scope (e.g., refactoring `live_gui` to per-file scope), revise the spec before approving. 
diff --git a/conductor/tracks/test_infrastructure_hardening_20260609/state.toml b/conductor/tracks/test_infrastructure_hardening_20260609/state.toml new file mode 100644 index 00000000..0c66bece --- /dev/null +++ b/conductor/tracks/test_infrastructure_hardening_20260609/state.toml @@ -0,0 +1,142 @@ +# Track state for test_infrastructure_hardening_20260609 +# Updated by Tier 2 Tech Lead as tasks complete + +[meta] +track_id = "test_infrastructure_hardening_20260609" +name = "Test Infrastructure Hardening (2026-06-09)" +status = "active" +current_phase = 0 +last_updated = "2026-06-09" + +[blocked_by] +# No blockers; this track is the foundation for the 4 upcoming tracks + +[blocks] +qwen_llama_grok_integration_20260606 = "planned in this track" +data_oriented_error_handling_20260606 = "planned in this track" +data_structure_strengthening_20260606 = "planned in this track" +mcp_architecture_refactor_20260606 = "planned in this track" +code_path_audit_20260607 = "planned in this track" + +[phases] +phase_1 = { status = "pending", checkpointsha = "", name = "Audit" } +phase_2 = { status = "pending", checkpointsha = "", name = "FR1: Per-test subprocess health check + respawn" } +phase_3 = { status = "pending", checkpointsha = "", name = "FR2: live_gui_workspace fixture + 6 test files" } +phase_4 = { status = "pending", checkpointsha = "", name = "FR3: Coalesce _sync_rag_engine calls" } +phase_5 = { status = "pending", checkpointsha = "", name = "FR4: Fix set_value hook for ai_input" } +phase_6 = { status = "pending", checkpointsha = "", name = "FR5: Optional clean_baseline marker" } +phase_7 = { status = "pending", checkpointsha = "", name = "FR6: Test bed health report" } +phase_8 = { status = "pending", checkpointsha = "", name = "Docs + audit script extension" } + +[tasks] +# Phase 1: Audit +t1_1_1 = { status = "pending", commit_sha = "", description = "Enumerate live_gui test cross-file state dependencies" } +t1_1_2 = { status = "pending", commit_sha = "", 
description = "Document set_value/get_value/reset_session per test" } +t1_1_3 = { status = "pending", commit_sha = "", description = "Categorize self-contained vs cross-test-dependent" } +t1_2_1 = { status = "pending", commit_sha = "", description = "Find hardcoded tests/artifacts/live_gui_workspace references" } +t1_2_2 = { status = "pending", commit_sha = "", description = "Find Path('C:/projects/') references in tests" } +t1_3_1 = { status = "pending", commit_sha = "", description = "Read _sync_rag_engine and its callers" } +t1_3_2 = { status = "pending", commit_sha = "", description = "Write sync_rag_race.md audit" } +t1_4_1 = { status = "pending", commit_sha = "", description = "Read /api/gui/set_value endpoint" } +t1_4_2 = { status = "pending", commit_sha = "", description = "Read __setattr__ and _UI_FLAG_DEFAULTS allowlist" } +t1_4_3 = { status = "pending", commit_sha = "", description = "Diagnostic test of set_value('ai_input')" } +t1_4_4 = { status = "pending", commit_sha = "", description = "Write set_value_hook.md audit" } + +# Phase 2: FR1 +t2_1_1 = { status = "pending", commit_sha = "", description = "Pre-edit checkpoint (git stash)" } +t2_1_2 = { status = "pending", commit_sha = "", description = "Read existing live_gui fixture" } +t2_1_3 = { status = "pending", commit_sha = "", description = "Add _LiveGuiHandle class to conftest.py" } +t2_1_4 = { status = "pending", commit_sha = "", description = "Refactor live_gui fixture to use handle" } +t2_1_5 = { status = "pending", commit_sha = "", description = "Update all 49 live_gui test files to use new API" } +t2_1_6 = { status = "pending", commit_sha = "", description = "Run representative test to verify refactor" } +t2_1_7 = { status = "pending", commit_sha = "", description = "Commit refactor" } +t2_2_1 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_live_gui_respawn.py" } +t2_2_2 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" } 
+t2_2_3 = { status = "pending", commit_sha = "", description = "Add autouse _check_live_gui_health fixture" } +t2_2_4 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" } +t2_2_5 = { status = "pending", commit_sha = "", description = "Run full tier-3 batch to verify no regression" } +t2_2_6 = { status = "pending", commit_sha = "", description = "Commit autouse fixture" } + +# Phase 3: FR2 +t3_1_1 = { status = "pending", commit_sha = "", description = "Pre-edit checkpoint" } +t3_1_2 = { status = "pending", commit_sha = "", description = "Refactor live_gui to use tmp_path_factory.mktemp" } +t3_1_3 = { status = "pending", commit_sha = "", description = "Verify fixture still spawns correctly" } +t3_1_4 = { status = "pending", commit_sha = "", description = "Verify workspace is in tmp dir" } +t3_1_5 = { status = "pending", commit_sha = "", description = "Commit tmp_path_factory refactor" } +t3_2_1 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_live_gui_workspace_fixture.py" } +t3_2_2 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" } +t3_2_3 = { status = "pending", commit_sha = "", description = "Add live_gui_workspace fixture" } +t3_2_4 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" } +t3_2_5 = { status = "pending", commit_sha = "", description = "Commit live_gui_workspace fixture" } +t3_3_1 = { status = "pending", commit_sha = "", description = "Read each of 6 test files, identify hardcoded reference" } +t3_3_2 = { status = "pending", commit_sha = "", description = "Refactor 6 test files to use fixture (one at a time)" } +t3_3_3 = { status = "pending", commit_sha = "", description = "Run each updated test file in isolation" } +t3_3_4 = { status = "pending", commit_sha = "", description = "Run in batch to verify the RAG test passes after 4 sims" } +t3_3_5 = { status = "pending", commit_sha = "", description = "Commit 6-file refactor" } + +# Phase 
4: FR3 +t4_1_1 = { status = "pending", commit_sha = "", description = "Read existing _sync_rag_engine and setters" } +t4_1_2 = { status = "pending", commit_sha = "", description = "Add coalescing state to AppController.__init__" } +t4_1_3 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_sync_rag_engine_coalescing.py" } +t4_1_4 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" } +t4_1_5 = { status = "pending", commit_sha = "", description = "Refactor _sync_rag_engine to use token + dirty flag" } +t4_1_6 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" } +t4_1_7 = { status = "pending", commit_sha = "", description = "Run RAG test in batch to verify race is fixed" } +t4_1_8 = { status = "pending", commit_sha = "", description = "Commit io_pool race fix" } + +# Phase 5: FR4 +t5_1_1 = { status = "pending", commit_sha = "", description = "Read failing test test_gui2_set_value_hook_works.py" } +t5_1_2 = { status = "pending", commit_sha = "", description = "Run test to confirm failure" } +t5_1_3 = { status = "pending", commit_sha = "", description = "Trace flow with diagnostic (in tests/artifacts, not src/)" } +t5_2_1 = { status = "pending", commit_sha = "", description = "Apply surgical fix" } +t5_2_2 = { status = "pending", commit_sha = "", description = "Verify test passes" } +t5_2_3 = { status = "pending", commit_sha = "", description = "Commit set_value fix" } + +# Phase 6: FR5 +t6_1_1 = { status = "pending", commit_sha = "", description = "Add clean_baseline marker to pyproject.toml" } +t6_1_2 = { status = "pending", commit_sha = "", description = "Write failing test in tests/test_clean_baseline_marker.py" } +t6_1_3 = { status = "pending", commit_sha = "", description = "Confirm test FAILS" } +t6_1_4 = { status = "pending", commit_sha = "", description = "Add autouse _reset_clean_baseline fixture" } +t6_1_5 = { status = "pending", commit_sha = "", description = "Confirm test 
PASSES" } +t6_1_6 = { status = "pending", commit_sha = "", description = "Commit clean_baseline marker" } + +# Phase 7: FR6 +t7_1_1 = { status = "pending", commit_sha = "", description = "Run tier-1 unit tests" } +t7_1_2 = { status = "pending", commit_sha = "", description = "Run tier-2 mock_app tests" } +t7_1_3 = { status = "pending", commit_sha = "", description = "Run tier-3 live_gui tests" } +t7_1_4 = { status = "pending", commit_sha = "", description = "Summarize pass/fail" } +t7_2_1 = { status = "pending", commit_sha = "", description = "Write docs/reports/test_bed_health_20260609.md" } +t7_2_2 = { status = "pending", commit_sha = "", description = "Commit test_bed_health report" } + +# Phase 8: Docs + audit +t8_1_1 = { status = "pending", commit_sha = "", description = "Read existing check_test_toml_paths.py" } +t8_1_2 = { status = "pending", commit_sha = "", description = "Add new patterns to audit script" } +t8_1_3 = { status = "pending", commit_sha = "", description = "Run audit to verify 0 violations" } +t8_1_4 = { status = "pending", commit_sha = "", description = "Write TDD test for the audit" } +t8_1_5 = { status = "pending", commit_sha = "", description = "Confirm test PASSES" } +t8_1_6 = { status = "pending", commit_sha = "", description = "Commit audit extension" } +t8_2_1 = { status = "pending", commit_sha = "", description = "Read existing guide_testing.md" } +t8_2_2 = { status = "pending", commit_sha = "", description = "Add §8 Per-test subprocess resilience" } +t8_2_3 = { status = "pending", commit_sha = "", description = "Commit docs update" } + +[verification] +phase_1_audits_committed = false +phase_2_respawn_fixture_works = false +phase_3_rag_test_passes_in_batch = false +phase_4_io_pool_race_fixed = false +phase_5_set_value_works_in_batch = false +phase_6_clean_baseline_marker_works = false +phase_7_test_bed_health_report_committed = false +phase_8_docs_and_audit_extended = false + +[baseline_capture] +# Captured in Phase 0 of the plan +# 
Will be populated by Tier 2 before Phase 1 begins +tier_1_status = "TBD" +tier_2_status = "TBD" +tier_3_status = "TBD" +batch_log = "TBD" + +[user_corrections_log] +# Record user-corrections here as the track progresses +# Format: phase_num, original_claim, correction, reason