diff --git a/conductor/tracks/test_batching_refactor_20260606/metadata.json b/conductor/tracks/test_batching_refactor_20260606/metadata.json new file mode 100644 index 00000000..c2640ddc --- /dev/null +++ b/conductor/tracks/test_batching_refactor_20260606/metadata.json @@ -0,0 +1,77 @@ +{ + "track_id": "test_batching_refactor_20260606", + "name": "Test Batching Refactor", + "initialized": "2026-06-06", + "owner": "tier2-tech-lead", + "priority": "medium", + "status": "active", + "type": "developer tooling + diagnostic improvement", + "scope": { + "new_files": [ + "scripts/test_categorizer.py", + "scripts/test_batcher.py", + "scripts/pytest_collection_order.py", + "tests/test_categories.toml", + "tests/test_categorizer.py", + "tests/test_batcher.py" + ], + "modified_files": [ + "scripts/run_tests_batched.py", + "tests/conftest.py", + "pyproject.toml" + ], + "deleted_files_at_phase4": [ + "scripts/run_tests_batched.py.legacy" + ] + }, + "blocked_by": [], + "blocks": [], + "estimated_phases": 4, + "spec": "spec.md", + "plan": "plan.md", + "priority_order": "B (process isolation by fixture class) > A (subsystem diagnostic grouping) > C (xdist + live_gui session reuse)", + "tier_model": { + "0_opt_in": "test_clean_install.py, test_docker_build.py; one batch per file; runs only if env var set AND --include-opt-in passed", + "1_unit": "Pure unit tests (no live_gui/mock_app/app_instance); grouped by batch_group; pytest-xdist -n auto", + "2_mock_app": "Tests using mock_app or app_instance fixtures; grouped by batch_group; no xdist", + "3_live_gui": "All tests using live_gui fixture in ONE pytest invocation (session-scoped reuse)", + "H_headless": "Headless service tests; one pytest invocation", + "P_performance": "Performance/stress tests; runs last; one pytest invocation" + }, + "hybrid_classification": "Auto-infer by default from filename and AST fixture scan; tests/test_categories.toml provides hand-curated overrides for cross-cutting and ambiguous files. 
Registry entries always take precedence.", + "architectural_invariant": "Every pytest subprocess invocation has a single, well-defined fixture profile. live_gui tests never share a pytest process with non-live_gui tests. Opt-in tests are gated on BOTH env var AND --include-opt-in CLI flag (defense in depth).", + "cli_surface": { + "default": "All tiers except opt-in (0) and performance (P); xdist enabled for tier 1", + "--tiers": "Comma-separated tier list to include (e.g. --tiers 1,2,3)", + "--include-opt-in": "Hard flag required IN ADDITION to env var to run opt-in tests", + "--plan": "Dry-run; print batch plan and exit", + "--audit": "List auto-inferred (unclassified) files; exit non-zero on hard errors", + "--no-xdist": "Disable pytest-xdist for tier 1 (debug aid)", + "--strict-markers": "Pass --strict-markers to pytest (catch marker typos)" + }, + "verification_criteria": [ + "scripts/test_categorizer.py::categorize_all returns 277+ CategoryRecords with no exceptions", + "scripts/test_batcher.py::plan is deterministic (same inputs -> same outputs)", + "All 277+ test files are correctly classified: live_gui / mock_app / unit / headless / opt_in / performance", + "Cross-cutting files (test_gui_dag_beads, test_arch_boundary_phase*, etc.) 
are flagged with multiple subsystems in the report", + "--plan output matches the existing 4-at-a-time batching modulo opt-in gating", + "No live_gui test ever runs in the same pytest invocation as a non-live_gui test", + "Opt-in tests are skipped silently when env var is not set (no warning, no error)", + "Opt-in tests are skipped silently when --include-opt-in is not passed (env var alone is insufficient)", + "scripts/check_test_toml_paths.py still exits 0 (no real TOML references in tests)", + "Existing 273+ test suite passes when run via the new script in --tiers 1,2,3 mode", + "tests/test_categorizer.py and tests/test_batcher.py pass with >80% coverage", + "pytest_collection_order plugin is a no-op when no [[test_order]] entries exist (zero overhead)" + ], + "links": { + "backlog_entry": "conductor/tracks.md (to be added at top of Remaining Backlog)", + "current_script": "scripts/run_tests_batched.py", + "testing_guide": "docs/guide_testing.md", + "workflow_pitfalls": "conductor/workflow.md#known-pitfalls-2026-06-05", + "related_tracks": [ + "conductor/tracks/startup_speedup_20260606/", + "conductor/tracks/regression_fixes_20260605/", + "conductor/tracks/live_gui_test_hardening_v2_20260605/" + ] + } +} diff --git a/conductor/tracks/test_batching_refactor_20260606/spec.md b/conductor/tracks/test_batching_refactor_20260606/spec.md new file mode 100644 index 00000000..6de1d317 --- /dev/null +++ b/conductor/tracks/test_batching_refactor_20260606/spec.md @@ -0,0 +1,348 @@ +# Track: Test Batching Refactor + +**Status:** Active (spec approved 2026-06-06) +**Initialized:** 2026-06-06 +**Owner:** Tier 2 Tech Lead +**Priority:** Medium (developer ergonomics + diagnostic improvement; not a regression blocker) + +--- + +## 1. Problem Statement + +The current test batching script (`scripts/run_tests_batched.py`, 36 lines) groups test files alphabetically in chunks of 4 with `pytest --maxfail=10`. This produces three concrete failure modes: + +1. 
**Zero diagnostic signal on failure.** When batch 17 fails, the user sees four unrelated filenames and a traceback. There is no way to know which subsystem broke without re-running individual files. +2. **No awareness of `live_gui` session-scoped fixture.** The `conductor/workflow.md` Known Pitfalls (2026-06-05) explicitly document that `live_gui` is session-scoped and that tests assuming a clean ImGui state are fragile. The current script *accidentally* avoids cross-batch pollution (each batch is a fresh `subprocess.run`) but is one refactor away from breaking that. +3. **No awareness of opt-in tests.** `test_clean_install.py` and `test_docker_build.py` are gated on environment variables but have no marker-based enforcement; running the script on a fresh clone can spuriously invoke them. + +The script's 4-at-a-time batching also has the property that fast unit tests and slow live_gui tests can be mixed in the same pytest invocation if the order changes — the alphabetical sort happens to interleave them. + +## 2. Goals (Priority Order) + +| Priority | Goal | Rationale | +|---|---|---| +| **B (foundational)** | Process isolation by fixture class. live_gui never shares a pytest process with non-live_gui tests. | `live_gui` is session-scoped; mixing in the same `pytest` invocation causes state pollution. workflow.md 2026-06-05 gotchas are explicit. | +| **B (foundational)** | Opt-in tests gated on env var, skipped silently otherwise. | `test_clean_install.py` clones the repo; `test_docker_build.py` builds an image. Running these by default is wrong. | +| **A (primary value)** | Diagnostic precision via subsystem grouping. When a batch fails, the report names the subsystem. | The user's stated complaint: "naive alphabetical groupings" provide no signal. | +| **A (primary value)** | Warn on unclassified files (registry miss), do not fail the run. | New tests should be flagged for human review without blocking the suite. 
| +| **C (optimization)** | Tier-1 (unit) parallelism via `pytest-xdist`. | Pure unit tests are independent; xdist is a free 2-4x speedup there. | +| **C (optimization)** | Live-gui session reuse (all `*_sim.py` in one pytest invocation). | Each fresh `sloppy.py` startup costs ~15s. Reusing the session is the only way to keep live_gui runtime sane. | +| **Nice-to-have** | Opt-in per-test order control via the registry. | When test B is known to depend on test A's side effect, ordering matters. Optional; zero impact when unused. | + +### 2.1 Non-Goals + +- **Not** changing the underlying test framework (pytest stays). +- **Not** restructuring test files into subdirectories (the flat `tests/` layout is preserved). +- **Not** introducing new pytest markers on the test functions themselves. The categorization lives in a single registry file, not on the test code. +- **Not** making the script required for CI today. The existing `uv run pytest tests/ -v` invocation keeps working; this script is a developer ergonomics + diagnostic tool. + +## 3. Architecture + +### 3.1 Three-Tier Model (Fixture Class as Primary Axis) + +``` +tests/ + conftest.py # pytest plugin entry: registers collection_order plugin + test_categories.toml # hand-curated overrides + classification + artifacts/ # git-ignored; test outputs (unchanged) + logs/ # git-ignored; live_gui logs (unchanged) + *.py # test files (unchanged) + +scripts/ + run_tests_batched.py # REPLACED: now the orchestrator + pytest_collection_order.py # NEW: conftest-loaded plugin for opt-in order control + test_categorizer.py # NEW: classifier library (auto-infer + registry) + test_batcher.py # NEW: scheduler library (turn categories into batches) +``` + +The categorizer is a pure function: `categorize(filename) -> CategoryRecord`. The batcher is a pure function: `plan(categories, options) -> list[Batch]`. The script is the CLI shell that wires the two together and shells out to `pytest`. 
+ +### 3.2 Data Model + +```python +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path + +class FixtureClass(str, Enum): + UNIT = "unit" + MOCK_APP = "mock_app" + LIVE_GUI = "live_gui" + HEADLESS = "headless" + OPT_IN = "opt_in" + PERFORMANCE = "performance" + +class Speed(str, Enum): + FAST = "fast" # <1s typical + MEDIUM = "medium" # 1-5s + SLOW = "slow" # 5-30s + VERY_SLOW = "very_slow" # >30s + +@dataclass(frozen=True) +class CategoryRecord: + filename: str + fixture_class: FixtureClass + subsystems: list[str] # 1..N; multi-subsystem for cross-cutting + speed: Speed + batch_group: str # groups files within a tier for sub-batching + notes: str = "" + # Per-test order (opt-in). Default empty dict means natural pytest order. + test_order: dict[str, int] = field(default_factory=dict) + # Provenance: where did the classification come from? + source: str = "auto" # "auto" | "registry" + warnings: list[str] = field(default_factory=list) +``` + +### 3.3 The Six Tiers (Batches = pytest Subprocess Invocations) + +| Tier | FixtureClass | Batch strategy | xdist | Max-fail | +|---|---|---|---|---| +| **0** | `OPT_IN` | One pytest invocation per file; runs only if env var is set. Skipped silently otherwise. | no | 1 | +| **1** | `UNIT` | Grouped by `batch_group` into ~5–8 pytest invocations. | `-n auto` | 10 | +| **2** | `MOCK_APP` | Grouped by `batch_group` into ~3–5 pytest invocations. | no (single App instance) | 5 | +| **3** | `LIVE_GUI` | **One pytest invocation for all live_gui files.** Session-scoped reuse. Sub-report groups by subsystem via `--co`-derived reporting (post-hoc, from collected test IDs). | no | 1 (session crash = nuke) | +| **H** | `HEADLESS` | One pytest invocation; all headless service tests together. | no | 5 | +| **P** | `PERFORMANCE` | One pytest invocation; runs last so failures don't block the main feedback loop. | no | 1 | + +The ordering is: **0 → 1 → 2 → 3 → H → P** (opt-in first, perf last). 
+ +### 3.4 The Registry: `tests/test_categories.toml` + +```toml +# Schema for each [files.<name>] entry: +# fixture_class = "unit" | "mock_app" | "live_gui" | "headless" | "opt_in" | "performance" +# subsystems = list of strings (subsystem tags; cross-cutting tests list 2+) +# speed = "fast" | "medium" | "slow" | "very_slow" +# batch_group = string (sub-batching key within a tier) +# notes = free text (optional) +# +# Opt-in per-test order: +# [[files.<name>.test_order]] +# test_id = "test_foo::test_bar" # pytest node ID +# order = 10 # lower runs first; tests without entries sort after entries + +# Cross-cutting GUI+DAG+Beads test (would be auto-classified as "gui" but actually +# touches 3 subsystems; registry overrides subsystems to be explicit) +[files.test_gui_dag_beads] +fixture_class = "live_gui" +subsystems = ["gui", "dag", "beads"] +speed = "slow" +batch_group = "gui" +notes = "Cross-cutting: drives GUI, asserts on DAG state, exercises Beads backend" + +# Architectural boundary test (auto-classification would be ambiguous) +[files.test_arch_boundary_phase1] +fixture_class = "unit" +subsystems = ["architecture"] +speed = "fast" +batch_group = "core" +notes = "Phase 1 of the arch-boundary refactor; no fixture dependencies" + +# Opt-in per-test order example +[[files.test_mma_ticket_actions.test_order]] +test_id = "test_mma_ticket_actions::test_blocked_ticket_does_not_execute" +order = 5 + +[[files.test_mma_ticket_actions.test_order]] +test_id = "test_mma_ticket_actions::test_priority_ordering" +order = 10 +``` + +**Precedence:** registry entries always win. An auto-inferred `fixture_class = "unit"` is replaced by `fixture_class = "mock_app"` if the registry says so. This makes the registry the single source of truth for everything it touches, and the auto-inference is a sensible default for everything else. + +### 3.5 Auto-Inference Rules + +Implemented in `scripts/test_categorizer.py::auto_classify()`. 
Evaluated in order; first match wins: + +| # | Rule | Match condition | Result | +|---|---|---|---| +| 1 | Opt-in filename | `test_clean_install` or `test_docker_build` prefix | `OPT_IN` | +| 2 | live_gui fixture | File contains `def test_.*\(live_gui\):` or `\(live_gui\)\s*[:,)]` regex match in source | `LIVE_GUI` | +| 3 | Mock app fixture | File references `mock_app` or `app_instance` (fixture name) | `MOCK_APP` | +| 4 | Headless service | File references headless-service fixtures (e.g. `headless_client`, `TestClient(app)`) | `HEADLESS` | +| 5 | Performance keyword | Filename matches `*perf*`, `*stress*`, `*phase_3_final*`, `*phase_4_stress*` | `PERFORMANCE` | +| 6 | Default | None of the above | `UNIT` | + +**Subsystem auto-inference:** Take the longest known subsystem prefix from a curated list. Known prefixes (alphabetical for stable ordering): `ai`, `api`, `arch`, `ast`, `async`, `auto`, `beads`, `bias`, `cache`, `cli`, `cmd`, `comms`, `conductor`, `context`, `cost`, `dag`, `deepseek`, `diff`, `discussion`, `event`, `execution`, `external`, `ext`, `fuzzy`, `gemini`, `gui`, `headless`, `history`, `hooks`, `hot`, `imgui`, `layout`, `live`, `log`, `mcp`, `markdown`, `minimax`, `mma`, `model`, `orchestrator`, `outline`, `parallel`, `patch`, `perf`, `persona`, `phase`, `pipeline`, `preset`, `prior`, `process`, `project`, `provider`, `rag`, `script`, `session`, `shader`, `sim`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `theme`, `thinking`, `ticket`, `tier4`, `tiered`, `token`, `tool`, `track`, `tree`, `ts`, `undo`, `usage`, `user`, `vendor`, `view`, `visual`, `vlogger`, `websocket`, `workflow`, `workspace`, `z`. + +**Speed auto-inference:** Read `.test_durations.json` if present (key = `<file>::<test>`, value = seconds). Aggregate by file (p95). Map: `<1s` → FAST, `<5s` → MEDIUM, `<30s` → SLOW, else VERY_SLOW. If no history file, default to MEDIUM. 
+ +**Batch-group auto-inference:** Cluster subsystems into groups heuristically: +- `core` = `mcp`, `ai`, `context`, `api`, `dag`, `path`, `presets`, `personas`, `history`, `workspace`, `rag`, `beads`, `model`, `ast`, `async`, `cache`, `cli`, `cmd`, `fuzzy`, `hooks`, `log`, `markdown`, `orchestrator`, `outline`, `pipeline`, `project`, `provider`, `script`, `session`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `thinking`, `tier4`, `tiered`, `tool`, `track`, `tree`, `ts`, `usage`, `vendor`, `vlogger`, `websocket`, `workflow` +- `gui` = `gui`, `theme`, `imgui`, `layout`, `live`, `prior`, `visual`, `view`, `undo` +- `mma` = `mma`, `conductor`, `execution`, `ext`, `external`, `auto`, `manual`, `tier`, `arch`, `phase`, `process`, `z` +- `comms` = `comms`, `diff`, `patch`, `event`, `hot`, `process`, `shader` +- `headless` = `headless` + +Single-subsystem tests use that subsystem's group. Multi-subsystem tests default to the group of the FIRST subsystem in their list (registry override can correct). + +## 4. Components + +### 4.1 `scripts/test_categorizer.py` — Pure classifier + +```python +def auto_classify(path: Path, durations: dict[str, float] | None = None) -> CategoryRecord: ... +def load_registry(toml_path: Path) -> dict[str, dict]: ... +def merge_registry(auto: CategoryRecord, registry: dict) -> CategoryRecord: ... +def categorize_all(tests_dir: Path, registry_path: Path) -> list[CategoryRecord]: ... +``` + +Public API. No I/O at import time. Reads registry lazily. The `categorize_all` function returns one `CategoryRecord` per test file in `tests/`. Each record's `source` field is `"registry"` if the registry had any matching entry, else `"auto"`. Each record's `warnings` field is populated with any inconsistencies detected (e.g., auto-inferred fixture_class differs from registry). 
+ +### 4.2 `scripts/test_batcher.py` — Pure scheduler + +```python +@dataclass(frozen=True) +class Batch: + tier: str # "0", "1", "2", "3", "H", "P" + label: str # "tier-1-unit-core" + files: list[Path] + pytest_args: list[str] # e.g. ["-n", "auto", "--maxfail=10"] + estimated_seconds: float + skip_reason: str | None = None # populated for skipped opt-in batches + +def plan( + records: list[CategoryRecord], + *, + tiers: set[str] = {"0", "1", "2", "3", "H", "P"}, + include_opt_in: bool = False, + xdist: bool = True, +) -> list[Batch]: ... +``` + +The `plan` function is deterministic. The same `records` + same `options` produce the same `list[Batch]`. This makes the planner trivially testable and makes the `--plan` dry-run mode a one-liner. + +### 4.3 `scripts/run_tests_batched.py` — CLI orchestrator + +Responsibilities (slim, delegates everything else): +1. Parse CLI args (`--tiers`, `--include-opt-in`, `--plan`, `--audit`, `--no-xdist`, `--strict-markers`, `--tests-dir`, `--registry`). +2. Call `categorize_all(tests_dir, registry_path)`. +3. If `--audit`: print records where `source == "auto"`, exit non-zero if any have empty subsystem lists or other hard errors. Exit 0 if every record is well-formed even if some are auto-inferred. +4. If `--plan`: print the batch list (one row per batch with label, files, estimated seconds) and exit. +5. Otherwise: call `plan()`, iterate batches, run each as `subprocess.run(uv + pytest + pytest_args + files)`, accumulate per-batch results, print the summary table. +6. Return the worst per-batch exit code (0 only if all batches pass). + +The script is intentionally <150 lines. All logic lives in the two library modules. + +### 4.4 `scripts/pytest_collection_order.py` — Conftest-loaded plugin + +Hook: `pytest_collection_modifyitems(config, items)`. Reads `tests/test_categories.toml` once at session start, builds a `dict[str, int]` from `[[files.<name>.test_order]]` entries, then sorts items within each file by their order index. 
Items without an order index sort after items with one (preserves pytest's natural order for unannotated tests). + +Registered via `tests/conftest.py`: + +```python +pytest_plugins = ["scripts.pytest_collection_order"] +``` + +This is opt-in by design: if no `test_categories.toml` exists OR no `[[files.X.test_order]]` entries exist, the plugin is a no-op (zero items sorted, zero overhead). + +## 5. Output / Report Format + +After the run, the script prints a summary table: + +``` +[TIER 0] opt-in (clean_install) SKIPPED RUN_CLEAN_INSTALL_TEST not set +[TIER 0] opt-in (docker) SKIPPED RUN_DOCKER_TEST not set +[TIER 1] unit: core PASS 42/42 8.3s +[TIER 1] unit: gui PASS 17/17 2.1s +[TIER 1] unit: mma FAIL 12/13 1.8s ← test_mma_ticket_actions::test_x +[TIER 2] mock_app: core PASS 31/31 6.4s +[TIER 3] live_gui PASS 14/14 47.2s +[TIER H] headless PASS 3/3 4.0s +[TIER P] performance SKIPPED --tiers excludes P +[TOTAL] 4 tiers run, 120 tests, 69.8s, 1 failed +``` + +For Tier 3, the per-test failures are still in the regular pytest output (one pytest invocation); the summary line just reports the tier-level pass/fail. + +## 6. 
CLI Surface + +```powershell +# Default: all tiers except opt-in and performance; xdist on for tier 1 +python scripts/run_tests_batched.py + +# Skip slow/expensive stuff +python scripts/run_tests_batched.py --tiers 1,2 + +# Include opt-in tests (also requires the env var; the flag is a hard requirement +# so a CI run cannot accidentally enable them by exporting the env var) +python scripts/run_tests_batched.py --include-opt-in + +# Dry-run: show the batch plan, don't run anything +python scripts/run_tests_batched.py --plan + +# Audit: list unclassified (auto-inferred) files as warnings; exit non-zero only on hard errors +python scripts/run_tests_batched.py --audit + +# Disable xdist (e.g., when debugging a test that flakes under parallelism) +python scripts/run_tests_batched.py --no-xdist + +# Override the tests directory or registry path +python scripts/run_tests_batched.py --tests-dir tests --registry tests/test_categories.toml +``` + +The `--include-opt-in` flag is **additive** to env var gating, not a replacement. A user must both set the env var AND pass the flag. This prevents accidental opt-in execution when an env var is set globally. + +## 7. Configuration + +### 7.1 `pyproject.toml` addition + +```toml +[tool.pytest.ini_options] +addopts = ["-ra"] # --strict-markers is NOT set here; pass the script's --strict-markers flag instead +markers = [ + "integration: marks tests as integration tests (requires live GUI)", + "clean_install: clean install verification (opt-in via RUN_CLEAN_INSTALL_TEST=1)", + "docker: docker build and run test (opt-in via RUN_DOCKER_TEST=1)", +] +``` + +`--strict-markers` is opt-in via the script's `--strict-markers` flag, not added to `addopts` globally, to avoid breaking existing test runs that haven't been audited. + +### 7.2 `.test_durations.json` (auto-generated, git-ignored) + +Written by `run_tests_batched.py` after a successful run. 
Format: + +```json +{ + "tests/test_foo.py::test_bar": 0.043, + "tests/test_foo.py::test_baz": 1.234 +} +``` + +Used by the categorizer for `speed` auto-inference. If absent, all files default to MEDIUM speed (no batch reordering). Add `tests/.test_durations.json` to `.gitignore` (or place under `tests/artifacts/`). + +## 8. Migration / Rollout + +| Phase | What | Risk | +|---|---|---| +| **Phase 1 — Library + dry-run** | Add `test_categorizer.py`, `test_batcher.py`, `pytest_collection_order.py`. Add `--plan` and `--audit` modes to a NEW script (don't replace the old one yet). Run on a clean clone; manually verify the plan matches the existing 4-at-a-time behavior (modulo opt-in gating). | None. Old script untouched. | +| **Phase 2 — Shadow run** | Run the new script in CI as a non-blocking job (informational only). Compare its pass/fail signature to the old script's. Investigate any divergence. | Low. Old script still authoritative. | +| **Phase 3 — Switch default** | Replace the old `run_tests_batched.py` with the new one. Update `docs/guide_testing.md` to point at the new section. Keep the old script under `scripts/run_tests_batched.py.legacy` for one cycle. | Medium. Mitigation: Phase 2 shadow run. | +| **Phase 4 — Cleanup** | Delete the legacy script. Add the registry file (`tests/test_categories.toml`) populated with the ~30 cross-cutting / ambiguous files identified during audit. Mark the remaining files as auto-inferred in the report. | Low. | + +Each phase has its own implementation plan produced by the writing-plans skill. + +## 9. Risks & Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Auto-inference misclassifies a cross-cutting test, putting it in the wrong tier. | Medium | Medium (wrong fixture class could cause pollution) | `--audit` mode lists all auto-inferred records; CI gate on `--audit --strict` exits non-zero if any auto-classified file has multiple subsystems (a heuristic for "probably cross-cutting"). 
Registry overrides are one-line fixes. | +| Tier 3 (live_gui) shares one pytest process; one crash kills all live_gui tests for the run. | Low (existing behavior) | High (15s+ wasted + missing signal) | `--maxfail=1` for tier 3. Document the trade-off: faster average runtime, but a crash in one test forfeits the rest. | +| `pytest-xdist` introduces non-determinism in unit tests that share state via module globals. | Low | Medium | Audit scripts flag any unit test that mutates a module-level `src.*` global. Tests that do must be moved to Tier 2 (mock_app) or registered as `MOCK_APP` explicitly. | +| Speed auto-inference from `.test_durations.json` is stale. | Medium | Low (wrong `speed` field, not wrong tier) | `speed` affects only the summary table; tiers are determined by `fixture_class`. Stale speed data does not affect process isolation. | +| New tests added without a registry entry slip through unclassified. | Medium | Low | `--audit` mode warns; CI can gate on `--audit --strict` (planned for Phase 3). | +| `pytest_collection_order` plugin sorts items but tests have hard dependencies on collection order (e.g., shared module state). | Low | High | The plugin is opt-in per file. No `[[test_order]]` entries = natural pytest order. Document the contract in the plugin docstring. | + +## 10. Open Questions + +1. Should the registry live in `tests/` or at the repo root? (Proposal: `tests/test_categories.toml` so it lives next to the tests it describes.) +2. Should `batch_group` be inferred by default or required to be explicit? (Proposal: inferred by default; explicit in registry.) +3. Should we expose a `python scripts/run_tests_batched.py --tier 3 --file test_gui_dag_beads` mode for ad-hoc single-file runs? (Proposal: yes, defer to a follow-up plan.) +4. Should the speed auto-inference be updated incrementally (per run) or only on explicit `--record-durations` opt-in? (Proposal: per-run by default; the file is git-ignored so it's just a developer-local cache.) 
+ +## 11. See Also + +- `docs/guide_testing.md` — current testing guide (will be updated in Phase 3 to reference the new script) +- `conductor/workflow.md` "Known Pitfalls (2026-06-05)" — `live_gui` session-scoped fixture gotchas +- `conductor/tracks/startup_speedup_20260606/` — example of a prior active track in this project (same convention) diff --git a/conductor/tracks/test_batching_refactor_20260606/state.toml b/conductor/tracks/test_batching_refactor_20260606/state.toml new file mode 100644 index 00000000..7b56d230 --- /dev/null +++ b/conductor/tracks/test_batching_refactor_20260606/state.toml @@ -0,0 +1,97 @@ +# Track state for test_batching_refactor_20260606 +# Updated by Tier 2 Tech Lead as tasks complete + +[meta] +track_id = "test_batching_refactor_20260606" +name = "Test Batching Refactor" +status = "active" +current_phase = 0 +last_updated = "2026-06-06" + +[phases] +# Phase 1: Library + dry-run (categorizer + batcher + plugin, --plan/--audit modes) +phase_1 = { status = "pending", checkpoint_sha = "", name = "Library + dry-run modes" } +# Phase 2: Shadow run (compare new vs old in CI, no behavior change) +phase_2 = { status = "pending", checkpoint_sha = "", name = "Shadow run + divergence check" } +# Phase 3: Switch default (replace old script, update guide_testing.md) +phase_3 = { status = "pending", checkpoint_sha = "", name = "Switch default + docs update" } +# Phase 4: Cleanup (populate registry, delete legacy, archive track) +phase_4 = { status = "pending", checkpoint_sha = "", name = "Registry population + legacy removal" } + +[tasks] +# Phase 1: Library + dry-run +# (Tasks TBD by writing-plans skill; placeholder structure only) +t1_1 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_opt_in_filename" } +t1_2 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_live_gui_fixture_scan" } +t1_3 = { status = "pending", commit_sha = "", 
description = "Red: tests/test_categorizer.py::test_auto_classify_mock_app_fixture_scan" } +t1_4 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_perf_keyword" } +t1_5 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_default_unit" } +t1_6 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_subsystem_inference_known_prefixes" } +t1_7 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_speed_inference_from_durations" } +t1_8 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_batch_group_inference" } +t1_9 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_merge_registry_overrides_auto" } +t1_10 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_categorize_all_277_files" } +t1_11 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_categorizer.py" } +t1_12 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_unit_tier_groups_by_batch_group" } +t1_13 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_live_gui_tier_one_invocation" } +t1_14 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_opt_in_skipped_without_flag" } +t1_15 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_deterministic" } +t1_16 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_xdist_only_for_tier_1" } +t1_17 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_batcher.py" } +t1_18 = { status = "pending", commit_sha = "", description = "Red: 
tests/test_pytest_collection_order.py::test_no_op_without_entries" } +t1_19 = { status = "pending", commit_sha = "", description = "Red: tests/test_pytest_collection_order.py::test_sorts_by_order_index" } +t1_20 = { status = "pending", commit_sha = "", description = "Green: implement scripts/pytest_collection_order.py" } +t1_21 = { status = "pending", commit_sha = "", description = "Wire pytest plugin in tests/conftest.py (pytest_plugins list)" } +t1_22 = { status = "pending", commit_sha = "", description = "Implement scripts/run_tests_batched.py with --plan and --audit modes only" } +t1_23 = { status = "pending", commit_sha = "", description = "Manually verify --plan output: all 277 files appear, tiers correctly assigned" } +t1_24 = { status = "pending", commit_sha = "", description = "Phase 1 checkpoint commit + git note" } +# Phase 2: Shadow run +t2_1 = { status = "pending", commit_sha = "", description = "Add CI workflow job: run new script in --tiers 1,2 mode; compare exit code to old script" } +t2_2 = { status = "pending", commit_sha = "", description = "Investigate any divergence; fix categorizer/batcher" } +t2_3 = { status = "pending", commit_sha = "", description = "Phase 2 checkpoint commit + git note" } +# Phase 3: Switch default +t3_1 = { status = "pending", commit_sha = "", description = "Add --include-opt-in and --tiers CLI handling to scripts/run_tests_batched.py" } +t3_2 = { status = "pending", commit_sha = "", description = "Add --durations record-on-success to scripts/run_tests_batched.py" } +t3_3 = { status = "pending", commit_sha = "", description = "Update docs/guide_testing.md 'Running Tests' section to reference new script" } +t3_4 = { status = "pending", commit_sha = "", description = "Rename old scripts/run_tests_batched.py to scripts/run_tests_batched.py.legacy" } +t3_5 = { status = "pending", commit_sha = "", description = "Phase 3 checkpoint commit + git note" } +# Phase 4: Cleanup +t4_1 = { status = "pending", commit_sha = "", 
description = "Run --audit on a clean clone; collect auto-inferred files" } +t4_2 = { status = "pending", commit_sha = "", description = "Populate tests/test_categories.toml with ~30 cross-cutting / ambiguous entries" } +t4_3 = { status = "pending", commit_sha = "", description = "Add tests/.test_durations.json to .gitignore" } +t4_4 = { status = "pending", commit_sha = "", description = "Delete scripts/run_tests_batched.py.legacy" } +t4_5 = { status = "pending", commit_sha = "", description = "Archive track: git mv conductor/tracks/test_batching_refactor_20260606/ conductor/tracks/archive/" } +t4_6 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md; move entry from Backlog to Recently Completed" } +t4_7 = { status = "pending", commit_sha = "", description = "Phase 4 checkpoint commit + git note" } + +[verification] +# Filled at Phase 4 +auto_classify_opt_in = false +auto_classify_live_gui = false +auto_classify_mock_app = false +auto_classify_perf = false +auto_classify_default_unit = false +subsystem_inference_known_prefixes = false +speed_inference_from_durations = false +batch_group_inference = false +merge_registry_overrides_auto = false +categorize_all_277_files = false +plan_unit_tier_groups_by_batch_group = false +plan_live_gui_tier_one_invocation = false +plan_opt_in_skipped_without_flag = false +plan_deterministic = false +plan_xdist_only_for_tier_1 = false +collection_order_no_op_without_entries = false +collection_order_sorts_by_order_index = false +plan_matches_4at_a_time = false +audit_exits_nonzero_on_hard_errors = false +opt_in_skipped_without_env_var = false +opt_in_skipped_without_include_flag = false +no_live_gui_in_same_invocation_as_others = false +existing_test_suite_passes = false +test_categorizer_coverage_pct = 0 +test_batcher_coverage_pct = 0 + +[registry_overrides] +# Populated in Phase 4 T4.2; one entry per cross-cutting or ambiguous file +# Format: {file = "test_X.py", fixture_class = "...", subsystems = 
["a", "b"], notes = "..."}