Private
Public Access
0
0

conductor(track): Initialize test_batching_refactor_20260606 spec

Three-tier batching refactor: replace alphabetical 4-at-a-time batching with
fixture-class-isolated tiers (0 opt-in, 1 unit/xdist, 2 mock_app, 3 live_gui
in one session, H headless, P performance).

Hybrid classification: auto-infer from filename + AST fixture scan; hand-curated
tests/test_categories.toml overrides for cross-cutting and ambiguous files.

Opt-in per-test order control via [[files.X.test_order]] sub-tables, gated on
a conftest-loaded pytest plugin (no-op without entries).

Priority order: B (process isolation) > A (subsystem diagnostic) > C (speed).
This commit is contained in:
2026-06-06 14:12:14 -04:00
parent 96158edd97
commit b7a9737443
3 changed files with 522 additions and 0 deletions
@@ -0,0 +1,77 @@
{
"track_id": "test_batching_refactor_20260606",
"name": "Test Batching Refactor",
"initialized": "2026-06-06",
"owner": "tier2-tech-lead",
"priority": "medium",
"status": "active",
"type": "developer tooling + diagnostic improvement",
"scope": {
"new_files": [
"scripts/test_categorizer.py",
"scripts/test_batcher.py",
"scripts/pytest_collection_order.py",
"tests/test_categories.toml",
"tests/test_categorizer.py",
"tests/test_batcher.py"
],
"modified_files": [
"scripts/run_tests_batched.py",
"tests/conftest.py",
"pyproject.toml"
],
"deleted_files_at_phase4": [
"scripts/run_tests_batched.py.legacy"
]
},
"blocked_by": [],
"blocks": [],
"estimated_phases": 4,
"spec": "spec.md",
"plan": "plan.md",
"priority_order": "B (process isolation by fixture class) > A (subsystem diagnostic grouping) > C (xdist + live_gui session reuse)",
"tier_model": {
"0_opt_in": "test_clean_install.py, test_docker_build.py; one batch per file; runs only if env var set AND --include-opt-in passed",
"1_unit": "Pure unit tests (no live_gui/mock_app/app_instance); grouped by batch_group; pytest-xdist -n auto",
"2_mock_app": "Tests using mock_app or app_instance fixtures; grouped by batch_group; no xdist",
"3_live_gui": "All tests using live_gui fixture in ONE pytest invocation (session-scoped reuse)",
"H_headless": "Headless service tests; one pytest invocation",
"P_performance": "Performance/stress tests; runs last; one pytest invocation"
},
"hybrid_classification": "Auto-infer by default from filename and AST fixture scan; tests/test_categories.toml provides hand-curated overrides for cross-cutting and ambiguous files. Registry always wins precedence.",
"architectural_invariant": "Every pytest subprocess invocation has a single, well-defined fixture profile. live_gui tests never share a pytest process with non-live_gui tests. Opt-in tests are gated on BOTH env var AND --include-opt-in CLI flag (defense in depth).",
"cli_surface": {
"default": "All tiers except opt-in (0) and performance (P); xdist enabled for tier 1",
"--tiers": "Comma-separated tier list to include (e.g. --tiers 1,2,3)",
"--include-opt-in": "Hard flag required IN ADDITION to env var to run opt-in tests",
"--plan": "Dry-run; print batch plan and exit",
"--audit": "List auto-inferred (unclassified) files; exit non-zero on hard errors",
"--no-xdist": "Disable pytest-xdist for tier 1 (debug aid)",
"--strict-markers": "Pass --strict-markers to pytest (catch marker typos)"
},
"verification_criteria": [
"scripts/test_categorizer.py::categorize_all returns 277+ CategoryRecords with no exceptions",
"scripts/test_batcher.py::plan is deterministic (same inputs -> same outputs)",
"All 277+ test files are correctly classified: live_gui / mock_app / unit / opt_in / performance",
"Cross-cutting files (test_gui_dag_beads, test_arch_boundary_phase*, etc.) are flagged with multiple subsystems in the report",
"--plan output matches the existing 4-at-a-time batching modulo opt-in gating",
"No live_gui test ever runs in the same pytest invocation as a non-live_gui test",
"Opt-in tests are skipped silently when env var is not set (no warning, no error)",
"Opt-in tests are skipped silently when --include-opt-in is not passed (env var alone is insufficient)",
"scripts/check_test_toml_paths.py still exits 0 (no real TOML references in tests)",
"Existing 273+ test suite passes when run via the new script in --tiers 1,2,3 mode",
"tests/test_categorizer.py and tests/test_batcher.py pass with >80% coverage",
"pytest_collection_order plugin is a no-op when no [[test_order]] entries exist (zero overhead)"
],
"links": {
"backlog_entry": "conductor/tracks.md (to be added at top of Remaining Backlog)",
"current_script": "scripts/run_tests_batched.py",
"testing_guide": "docs/guide_testing.md",
"workflow_pitfalls": "conductor/workflow.md#known-pitfalls-2026-06-05",
"related_tracks": [
"conductor/tracks/startup_speedup_20260606/",
"conductor/tracks/regression_fixes_20260605/",
"conductor/tracks/live_gui_test_hardening_v2_20260605/"
]
}
}
@@ -0,0 +1,348 @@
# Track: Test Batching Refactor
**Status:** Active (spec approved 2026-06-06)
**Initialized:** 2026-06-06
**Owner:** Tier 2 Tech Lead
**Priority:** Medium (developer ergonomics + diagnostic improvement; not a regression blocker)
---
## 1. Problem Statement
The current test batching script (`scripts/run_tests_batched.py`, 36 lines) groups test files alphabetically in chunks of 4 with `pytest --maxfail=10`. This produces three concrete failure modes:
1. **Zero diagnostic signal on failure.** When batch 17 fails, the user sees four unrelated filenames and a traceback. There is no way to know which subsystem broke without re-running individual files.
2. **No awareness of `live_gui` session-scoped fixture.** The `conductor/workflow.md` Known Pitfalls (2026-06-05) explicitly document that `live_gui` is session-scoped and that tests assuming a clean ImGui state are fragile. The current script *accidentally* avoids cross-batch pollution (each batch is a fresh `subprocess.run`) but is one refactor away from breaking that.
3. **No awareness of opt-in tests.** `test_clean_install.py` and `test_docker_build.py` are gated on environment variables but have no marker-based enforcement; running the script on a fresh clone can spuriously invoke them.
The script's 4-at-a-time batching also has the property that fast unit tests and slow live_gui tests can be mixed in the same pytest invocation if the order changes — the alphabetical sort happens to interleave them.
## 2. Goals (Priority Order)
| Priority | Goal | Rationale |
|---|---|---|
| **B (foundational)** | Process isolation by fixture class. live_gui never shares a pytest process with non-live_gui tests. | `live_gui` is session-scoped; mixing in the same `pytest` invocation causes state pollution. workflow.md 2026-06-05 gotchas are explicit. |
| **B (foundational)** | Opt-in tests gated on BOTH env var AND `--include-opt-in` flag; skipped silently otherwise. | `test_clean_install.py` clones the repo; `test_docker_build.py` builds an image. Running these by default is wrong. |
| **A (primary value)** | Diagnostic precision via subsystem grouping. When a batch fails, the report names the subsystem. | The user's stated complaint: "naive alphabetical groupings" provide no signal. |
| **A (primary value)** | Warn on unclassified files (registry miss), do not fail the run. | New tests should be flagged for human review without blocking the suite. |
| **C (optimization)** | Tier-1 (unit) parallelism via `pytest-xdist`. | Pure unit tests are independent; xdist is a free 2-4x speedup there. |
| **C (optimization)** | Live-gui session reuse (all `live_gui` test files in one pytest invocation). | Each fresh `sloppy.py` startup costs ~15s. Reusing the session is the only way to keep live_gui runtime sane. |
| **Nice-to-have** | Opt-in per-test order control via the registry. | When test B is known to depend on test A's side effect, ordering matters. Optional; zero impact when unused. |
### 2.1 Non-Goals
- **Not** changing the underlying test framework (pytest stays).
- **Not** restructuring test files into subdirectories (the flat `tests/` layout is preserved).
- **Not** introducing new pytest markers on the test functions themselves. The categorization lives in a single registry file, not on the test code.
- **Not** making the script required for CI today. The existing `uv run pytest tests/ -v` invocation keeps working; this script is a developer ergonomics + diagnostic tool.
## 3. Architecture
### 3.1 Three-Tier Model (Fixture Class as Primary Axis)
```
tests/
conftest.py # pytest plugin entry: registers collection_order plugin
test_categories.toml # hand-curated overrides + classification
artifacts/ # git-ignored; test outputs (unchanged)
logs/ # git-ignored; live_gui logs (unchanged)
*.py # test files (unchanged)
scripts/
run_tests_batched.py # REPLACED: now the orchestrator
pytest_collection_order.py # NEW: conftest-loaded plugin for opt-in order control
test_categorizer.py # NEW: classifier library (auto-infer + registry)
test_batcher.py # NEW: scheduler library (turn categories into batches)
```
The categorizer is a pure function: `categorize(filename) -> CategoryRecord`. The batcher is a pure function: `plan(categories, options) -> list[Batch]`. The script is the CLI shell that wires the two together and shells out to `pytest`.
### 3.2 Data Model
```python
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
class FixtureClass(str, Enum):
UNIT = "unit"
MOCK_APP = "mock_app"
LIVE_GUI = "live_gui"
HEADLESS = "headless"
OPT_IN = "opt_in"
PERFORMANCE = "performance"
class Speed(str, Enum):
FAST = "fast" # <1s typical
MEDIUM = "medium" # 1-5s
SLOW = "slow" # 5-30s
VERY_SLOW = "very_slow" # >30s
@dataclass(frozen=True)
class CategoryRecord:
filename: str
fixture_class: FixtureClass
subsystems: list[str] # 1..N; multi-subsystem for cross-cutting
speed: Speed
batch_group: str # groups files within a tier for sub-batching
notes: str = ""
# Per-test order (opt-in). Default empty dict means natural pytest order.
test_order: dict[str, int] = field(default_factory=dict)
# Provenance: where did the classification come from?
source: str = "auto" # "auto" | "registry"
warnings: list[str] = field(default_factory=list)
```
### 3.3 The Six Tiers (Batches = pytest Subprocess Invocations)
| Tier | FixtureClass | Batch strategy | xdist | Max-fail |
|---|---|---|---|---|
| **0** | `OPT_IN` | One pytest invocation per file; runs only if the env var is set AND `--include-opt-in` is passed. Skipped silently otherwise. | no | 1 |
| **1** | `UNIT` | Grouped by `batch_group` into ~58 pytest invocations. | `-n auto` | 10 |
| **2** | `MOCK_APP` | Grouped by `batch_group` into ~35 pytest invocations. | no (single App instance) | 5 |
| **3** | `LIVE_GUI` | **One pytest invocation for all live_gui files.** Session-scoped reuse. Sub-report groups by subsystem via `--co`-derived reporting (post-hoc, from collected test IDs). | no | 1 (session crash = nuke) |
| **H** | `HEADLESS` | One pytest invocation; all headless service tests together. | no | 5 |
| **P** | `PERFORMANCE` | One pytest invocation; runs last so failures don't block the main feedback loop. | no | 1 |
The ordering is: **0 → 1 → 2 → 3 → H → P** (opt-in first, perf last).
### 3.4 The Registry: `tests/test_categories.toml`
```toml
# Schema for each [files.<name>] entry:
# fixture_class = "unit" | "mock_app" | "live_gui" | "headless" | "opt_in" | "performance"
# subsystems = list of strings (subsystem tags; cross-cutting tests list 2+)
# speed = "fast" | "medium" | "slow" | "very_slow"
# batch_group = string (sub-batching key within a tier)
# notes = free text (optional)
#
# Opt-in per-test order:
# [[files.<name>.test_order]]
# test_id = "test_foo::test_bar" # "<file stem>::<test function>", not a full pytest node ID
# order = 10 # lower runs first; tests without entries sort after entries
# Cross-cutting GUI+DAG+Beads test (would be auto-classified as "gui" but actually
# touches 3 subsystems; registry overrides subsystems to be explicit)
[files.test_gui_dag_beads]
fixture_class = "live_gui"
subsystems = ["gui", "dag", "beads"]
speed = "slow"
batch_group = "gui"
notes = "Cross-cutting: drives GUI, asserts on DAG state, exercises Beads backend"
# Architectural boundary test (auto-classification would be ambiguous)
[files.test_arch_boundary_phase1]
fixture_class = "unit"
subsystems = ["architecture"]
speed = "fast"
batch_group = "core"
notes = "Phase 1 of the arch-boundary refactor; no fixture dependencies"
# Opt-in per-test order example
[[files.test_mma_ticket_actions.test_order]]
test_id = "test_mma_ticket_actions::test_blocked_ticket_does_not_execute"
order = 5
[[files.test_mma_ticket_actions.test_order]]
test_id = "test_mma_ticket_actions::test_priority_ordering"
order = 10
```
**Precedence:** registry entries always win. An auto-inferred `fixture_class = "unit"` is replaced by `fixture_class = "mock_app"` if the registry says so. This makes the registry the single source of truth for everything it touches, and the auto-inference is a sensible default for everything else.
### 3.5 Auto-Inference Rules
Implemented in `scripts/test_categorizer.py::auto_classify()`. Evaluated in order; first match wins:
| # | Rule | Match condition | Result |
|---|---|---|---|
| 1 | Opt-in filename | `test_clean_install` or `test_docker_build` prefix | `OPT_IN` |
| 2 | live_gui fixture | AST scan: any `test_*` function or fixture in the file lists `live_gui` among its parameters (in any position, e.g. `def test_x(live_gui, tmp_path):`) | `LIVE_GUI` |
| 3 | Mock app fixture | File references `mock_app` or `app_instance` (fixture name) | `MOCK_APP` |
| 4 | Headless service | File references headless-service fixtures (e.g. `headless_client`, `TestClient(app)`) | `HEADLESS` |
| 5 | Performance keyword | Filename matches `*perf*`, `*stress*`, `*phase_3_final*`, `*phase_4_stress*` | `PERFORMANCE` |
| 6 | Default | None of the above | `UNIT` |
**Subsystem auto-inference:** Take the longest known subsystem prefix from a curated list. Known prefixes (alphabetical for stable ordering): `ai`, `api`, `arch`, `ast`, `async`, `auto`, `beads`, `bias`, `cache`, `cli`, `cmd`, `comms`, `conductor`, `context`, `cost`, `dag`, `deepseek`, `diff`, `discussion`, `event`, `execution`, `ext`, `external`, `fuzzy`, `gemini`, `gui`, `headless`, `history`, `hooks`, `hot`, `imgui`, `layout`, `live`, `log`, `markdown`, `mcp`, `minimax`, `mma`, `model`, `orchestrator`, `outline`, `parallel`, `patch`, `perf`, `persona`, `phase`, `pipeline`, `preset`, `prior`, `process`, `project`, `provider`, `rag`, `script`, `session`, `shader`, `sim`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `theme`, `thinking`, `ticket`, `tier4`, `tiered`, `token`, `tool`, `track`, `tree`, `ts`, `undo`, `usage`, `user`, `vendor`, `view`, `visual`, `vlogger`, `websocket`, `workflow`, `workspace`, `z`.
**Speed auto-inference:** Read `.test_durations.json` if present (key = `<filename>::<test_id>`, value = seconds). Aggregate by file (p95). Map: `<1s` → FAST, `<5s` → MEDIUM, `<30s` → SLOW, else VERY_SLOW. If no history file, default to MEDIUM.
**Batch-group auto-inference:** Cluster subsystems into groups heuristically:
- `core` = `mcp`, `ai`, `context`, `api`, `dag`, `path`, `preset`, `persona`, `history`, `workspace`, `rag`, `beads`, `model`, `ast`, `async`, `cache`, `cli`, `cmd`, `fuzzy`, `hooks`, `log`, `markdown`, `orchestrator`, `outline`, `pipeline`, `project`, `provider`, `script`, `session`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `thinking`, `tier4`, `tiered`, `tool`, `track`, `tree`, `ts`, `usage`, `vendor`, `vlogger`, `websocket`, `workflow`
- `gui` = `gui`, `theme`, `imgui`, `layout`, `live`, `prior`, `visual`, `view`, `undo`
- `mma` = `mma`, `conductor`, `execution`, `ext`, `external`, `auto`, `manual`, `tier`, `arch`, `phase`, `process`, `z`
- `comms` = `comms`, `diff`, `patch`, `event`, `hot`, `process`, `shader`
- `headless` = `headless`
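Single-subsystem tests use that subsystem's group. Multi-subsystem tests default to the group of the FIRST subsystem in their list (registry override can correct). A subsystem that appears in more than one group above (currently `process`, listed under both `mma` and `comms`) resolves to the first group that lists it, so the mapping stays deterministic.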
## 4. Components
### 4.1 `scripts/test_categorizer.py` — Pure classifier
```python
def auto_classify(path: Path, durations: dict[str, float] | None = None) -> CategoryRecord: ...
def load_registry(toml_path: Path) -> dict[str, dict]: ...
def merge_registry(auto: CategoryRecord, registry: dict) -> CategoryRecord: ...
def categorize_all(tests_dir: Path, registry_path: Path) -> list[CategoryRecord]: ...
```
Public API. No I/O at import time. Reads registry lazily. The `categorize_all` function returns one `CategoryRecord` per test file in `tests/`. Each record's `source` field is `"registry"` if the registry had any matching entry, else `"auto"`. Each record's `warnings` field is populated with any inconsistencies detected (e.g., auto-inferred fixture_class differs from registry).
### 4.2 `scripts/test_batcher.py` — Pure scheduler
```python
@dataclass(frozen=True)
class Batch:
tier: str # "0", "1", "2", "3", "H", "P"
label: str # "tier-1-unit-core"
files: list[Path]
pytest_args: list[str] # e.g. ["-n", "auto", "--maxfail=10"]
estimated_seconds: float
skip_reason: str | None = None # populated for skipped opt-in batches
def plan(
records: list[CategoryRecord],
*,
tiers: set[str] = {"0", "1", "2", "3", "H", "P"},
include_opt_in: bool = False,
xdist: bool = True,
) -> list[Batch]: ...
```
The `plan` function is deterministic. The same `records` + same `options` produce the same `list[Batch]`. This makes the planner trivially testable and makes the `--plan` dry-run mode a one-liner.
### 4.3 `scripts/run_tests_batched.py` — CLI orchestrator
Responsibilities (slim, delegates everything else):
1. Parse CLI args (`--tiers`, `--include-opt-in`, `--plan`, `--audit`, `--no-xdist`, `--strict-markers`, `--tests-dir`, `--registry`).
2. Call `categorize_all(tests_dir, registry_path)`.
3. If `--audit`: print records where `source == "auto"`, exit non-zero if any have empty subsystem lists or other hard errors. Exit 0 if every record is well-formed even if some are auto-inferred.
4. If `--plan`: print the batch list (one row per batch with label, files, estimated seconds) and exit.
5. Otherwise: call `plan()`, iterate batches, run each as `subprocess.run(uv + pytest + pytest_args + files)`, accumulate per-batch results, print the summary table.
6. Return the worst per-batch exit code (0 only if all batches pass).
The script is intentionally <150 lines. All logic lives in the two library modules.
### 4.4 `scripts/pytest_collection_order.py` — Conftest-loaded plugin
Hook: `pytest_collection_modifyitems(config, items)`. Reads `tests/test_categories.toml` once at session start, builds a `dict[str, int]` from `[[files.<name>.test_order]]` entries, then sorts items within each file by their order index. Items without an order index sort after items with one (preserves pytest's natural order for unannotated tests).
Registered via `tests/conftest.py`:
```python
pytest_plugins = ["scripts.pytest_collection_order"]
```
This is opt-in by design: if no `test_categories.toml` exists OR no `[[files.X.test_order]]` entries exist, the plugin is a no-op (zero items sorted, zero overhead).
## 5. Output / Report Format
After the run, the script prints a summary table:
```
[TIER 0] opt-in (clean_install) SKIPPED RUN_CLEAN_INSTALL_TEST not set
[TIER 0] opt-in (docker) SKIPPED RUN_DOCKER_TEST not set
[TIER 1] unit: core PASS 42/42 8.3s
[TIER 1] unit: gui PASS 17/17 2.1s
[TIER 1] unit: mma FAIL 12/13 1.8s ← test_mma_ticket_actions::test_x
[TIER 2] mock_app: core PASS 31/31 6.4s
[TIER 3] live_gui PASS 14/14 47.2s
[TIER H] headless PASS 3/3 4.0s
[TIER P] performance SKIPPED --tiers excludes P
[TOTAL] 4 tiers run, 120 tests, 69.8s, 1 failed
```
For Tier 3, the per-test failures are still in the regular pytest output (one pytest invocation); the summary line just reports the tier-level pass/fail.
## 6. CLI Surface
```powershell
# Default: all tiers except opt-in and performance; xdist on for tier 1
python scripts/run_tests_batched.py
# Skip slow/expensive stuff
python scripts/run_tests_batched.py --tiers 1,2
# Include opt-in tests (also requires the env var; the flag is a hard requirement
# so a CI run cannot accidentally enable them by exporting the env var)
python scripts/run_tests_batched.py --include-opt-in
# Dry-run: show the batch plan, don't run anything
python scripts/run_tests_batched.py --plan
# Audit: list unclassified (auto-inferred) files as warnings; exit non-zero only on hard errors
python scripts/run_tests_batched.py --audit
# Disable xdist (e.g., when debugging a test that flakes under parallelism)
python scripts/run_tests_batched.py --no-xdist
# Override the tests directory or registry path
python scripts/run_tests_batched.py --tests-dir tests --registry tests/test_categories.toml
```
The `--include-opt-in` flag is **additive** to env var gating, not a replacement. A user must both set the env var AND pass the flag. This prevents accidental opt-in execution when an env var is set globally.
## 7. Configuration
### 7.1 `pyproject.toml` addition
```toml
[tool.pytest.ini_options]
addopts = ["-ra"] # --strict-markers is NOT added here; it is passed only via the script's --strict-markers flag
markers = [
"integration: marks tests as integration tests (requires live GUI)",
"clean_install: clean install verification (opt-in via RUN_CLEAN_INSTALL_TEST=1)",
"docker: docker build and run test (opt-in via RUN_DOCKER_TEST=1)",
]
```
`--strict-markers` is opt-in via the script's `--strict-markers` flag, not added to `addopts` globally, to avoid breaking existing test runs that haven't been audited.
### 7.2 `.test_durations.json` (auto-generated, git-ignored)
Written by `run_tests_batched.py` after a successful run. Format:
```json
{
"tests/test_foo.py::test_bar": 0.043,
"tests/test_foo.py::test_baz": 1.234
}
```
Used by the categorizer for `speed` auto-inference. If absent, all files default to MEDIUM speed (no batch reordering). Add `tests/.test_durations.json` to `.gitignore` (or place under `tests/artifacts/`).
## 8. Migration / Rollout
| Phase | What | Risk |
|---|---|---|
| **Phase 1 — Library + dry-run** | Add `test_categorizer.py`, `test_batcher.py`, `pytest_collection_order.py`. Add `--plan` and `--audit` modes to a NEW script (don't replace the old one yet). Run on a clean clone; manually verify the plan matches the existing 4-at-a-time behavior (modulo opt-in gating). | None. Old script untouched. |
| **Phase 2 — Shadow run** | Run the new script in CI as a non-blocking job (informational only). Compare its pass/fail signature to the old script's. Investigate any divergence. | Low. Old script still authoritative. |
| **Phase 3 — Switch default** | Replace the old `run_tests_batched.py` with the new one. Update `docs/guide_testing.md` to point at the new section. Keep the old script under `scripts/run_tests_batched.py.legacy` for one cycle. | Medium. Mitigation: Phase 2 shadow run. |
| **Phase 4 — Cleanup** | Delete the legacy script. Add the registry file (`tests/test_categories.toml`) populated with the ~30 cross-cutting / ambiguous files identified during audit. Mark the remaining files as auto-inferred in the report. | Low. |
Each phase has its own implementation plan produced by the writing-plans skill.
## 9. Risks & Mitigations
| Risk | Likelihood | Impact | Mitigation |
|---|---|---|---|
| Auto-inference misclassifies a cross-cutting test, putting it in the wrong tier. | Medium | Medium (wrong fixture class could cause pollution) | `--audit` mode lists all auto-inferred records; CI gate on `--audit --strict` exits non-zero if any auto-classified file has multiple subsystems (a heuristic for "probably cross-cutting"). Registry overrides are one-line fixes. |
| Tier 3 (live_gui) shares one pytest process; one crash kills all live_gui tests for the run. | Low (existing behavior) | High (15s+ wasted + missing signal) | `--maxfail=1` for tier 3. Document the trade-off: faster average runtime, but a crash in one test forfeits the rest. |
| `pytest-xdist` introduces non-determinism in unit tests that share state via module globals. | Low | Medium | Audit scripts flag any unit test that mutates a module-level `src.*` global. Tests that do must be moved to Tier 2 (mock_app) or registered as `MOCK_APP` explicitly. |
| Speed auto-inference from `.test_durations.json` is stale. | Medium | Low (wrong `speed` field, not wrong tier) | `speed` affects only the summary table; tiers are determined by `fixture_class`. Stale speed data does not affect process isolation. |
| New tests added without a registry entry slip through unclassified. | Medium | Low | `--audit` mode warns; CI can gate on `--audit --strict` (planned for Phase 3). |
| `pytest_collection_order` plugin sorts items but tests have hard dependencies on collection order (e.g., shared module state). | Low | High | The plugin is opt-in per file. No `[[test_order]]` entries = natural pytest order. Document the contract in the plugin docstring. |
## 10. Open Questions
1. Should the registry live in `tests/` or at the repo root? (Proposal: `tests/test_categories.toml` so it lives next to the tests it describes.)
2. Should `batch_group` be inferred by default or required to be explicit? (Proposal: inferred by default; explicit in registry.)
3. Should we expose a `python scripts/run_tests_batched.py --tier 3 --file test_gui_dag_beads` mode for ad-hoc single-file runs? (Proposal: yes, defer to a follow-up plan.)
4. Should the speed auto-inference be updated incrementally (per run) or only on explicit `--record-durations` opt-in? (Proposal: per-run by default; the file is git-ignored so it's just a developer-local cache.)
## 11. See Also
- `docs/guide_testing.md` — current testing guide (will be updated in Phase 3 to reference the new script)
- `conductor/workflow.md` "Known Pitfalls (2026-06-05)" — `live_gui` session-scoped fixture gotchas
- `conductor/tracks/startup_speedup_20260606/` — example of a prior active track in this project (same convention)
@@ -0,0 +1,97 @@
# Track state for test_batching_refactor_20260606
# Updated by Tier 2 Tech Lead as tasks complete
[meta]
track_id = "test_batching_refactor_20260606"
name = "Test Batching Refactor"
status = "active"
current_phase = 0
last_updated = "2026-06-06"
[phases]
# Phase 1: Library + dry-run (categorizer + batcher + plugin, --plan/--audit modes)
phase_1 = { status = "pending", checkpoint_sha = "", name = "Library + dry-run modes" }
# Phase 2: Shadow run (compare new vs old in CI, no behavior change)
phase_2 = { status = "pending", checkpoint_sha = "", name = "Shadow run + divergence check" }
# Phase 3: Switch default (replace old script, update guide_testing.md)
phase_3 = { status = "pending", checkpoint_sha = "", name = "Switch default + docs update" }
# Phase 4: Cleanup (populate registry, delete legacy, archive track)
phase_4 = { status = "pending", checkpoint_sha = "", name = "Registry population + legacy removal" }
[tasks]
# Phase 1: Library + dry-run
# (Tasks TBD by writing-plans skill; placeholder structure only)
t1_1 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_opt_in_filename" }
t1_2 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_live_gui_fixture_scan" }
t1_3 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_mock_app_fixture_scan" }
t1_4 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_perf_keyword" }
t1_5 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_default_unit" }
t1_6 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_subsystem_inference_known_prefixes" }
t1_7 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_speed_inference_from_durations" }
t1_8 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_batch_group_inference" }
t1_9 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_merge_registry_overrides_auto" }
t1_10 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_categorize_all_277_files" }
t1_11 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_categorizer.py" }
t1_12 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_unit_tier_groups_by_batch_group" }
t1_13 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_live_gui_tier_one_invocation" }
t1_14 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_opt_in_skipped_without_flag" }
t1_15 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_deterministic" }
t1_16 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_xdist_only_for_tier_1" }
t1_17 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_batcher.py" }
t1_18 = { status = "pending", commit_sha = "", description = "Red: tests/test_pytest_collection_order.py::test_no_op_without_entries" }
t1_19 = { status = "pending", commit_sha = "", description = "Red: tests/test_pytest_collection_order.py::test_sorts_by_order_index" }
t1_20 = { status = "pending", commit_sha = "", description = "Green: implement scripts/pytest_collection_order.py" }
t1_21 = { status = "pending", commit_sha = "", description = "Wire pytest plugin in tests/conftest.py (pytest_plugins list)" }
t1_22 = { status = "pending", commit_sha = "", description = "Implement the new orchestrator as a separate NEW script (existing scripts/run_tests_batched.py stays untouched until Phase 3) with --plan and --audit modes only" }
t1_23 = { status = "pending", commit_sha = "", description = "Manually verify --plan output: all 277 files appear, tiers correctly assigned" }
t1_24 = { status = "pending", commit_sha = "", description = "Phase 1 checkpoint commit + git note" }
# Phase 2: Shadow run
t2_1 = { status = "pending", commit_sha = "", description = "Add CI workflow job: run new script in --tiers 1,2 mode; compare exit code to old script" }
t2_2 = { status = "pending", commit_sha = "", description = "Investigate any divergence; fix categorizer/batcher" }
t2_3 = { status = "pending", commit_sha = "", description = "Phase 2 checkpoint commit + git note" }
# Phase 3: Switch default
t3_1 = { status = "pending", commit_sha = "", description = "Add --include-opt-in and --tiers CLI handling to scripts/run_tests_batched.py" }
t3_2 = { status = "pending", commit_sha = "", description = "Add --durations record-on-success to scripts/run_tests_batched.py" }
t3_3 = { status = "pending", commit_sha = "", description = "Update docs/guide_testing.md 'Running Tests' section to reference new script" }
t3_4 = { status = "pending", commit_sha = "", description = "Rename old scripts/run_tests_batched.py to scripts/run_tests_batched.py.legacy" }
t3_5 = { status = "pending", commit_sha = "", description = "Phase 3 checkpoint commit + git note" }
# Phase 4: Cleanup
t4_1 = { status = "pending", commit_sha = "", description = "Run --audit on a clean clone; collect auto-inferred files" }
t4_2 = { status = "pending", commit_sha = "", description = "Populate tests/test_categories.toml with ~30 cross-cutting / ambiguous entries" }
t4_3 = { status = "pending", commit_sha = "", description = "Add tests/.test_durations.json to .gitignore" }
t4_4 = { status = "pending", commit_sha = "", description = "Delete scripts/run_tests_batched.py.legacy" }
t4_5 = { status = "pending", commit_sha = "", description = "Archive track: git mv conductor/tracks/test_batching_refactor_20260606/ conductor/tracks/archive/" }
t4_6 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md; move entry from Backlog to Recently Completed" }
t4_7 = { status = "pending", commit_sha = "", description = "Phase 4 checkpoint commit + git note" }
[verification]
# Filled at Phase 4
auto_classify_opt_in = false
auto_classify_live_gui = false
auto_classify_mock_app = false
auto_classify_perf = false
auto_classify_default_unit = false
subsystem_inference_known_prefixes = false
speed_inference_from_durations = false
batch_group_inference = false
merge_registry_overrides_auto = false
categorize_all_277_files = false
plan_unit_tier_groups_by_batch_group = false
plan_live_gui_tier_one_invocation = false
plan_opt_in_skipped_without_flag = false
plan_deterministic = false
plan_xdist_only_for_tier_1 = false
collection_order_no_op_without_entries = false
collection_order_sorts_by_order_index = false
plan_matches_4at_a_time = false
audit_exits_nonzero_on_hard_errors = false
opt_in_skipped_without_env_var = false
opt_in_skipped_without_include_flag = false
no_live_gui_in_same_invocation_as_others = false
existing_test_suite_passes = false
test_categorizer_coverage_pct = 0
test_batcher_coverage_pct = 0
[registry_overrides]
# Populated in Phase 4 T4.2; one entry per cross-cutting or ambiguous file
# Format: {file = "test_X.py", fixture_class = "...", subsystems = ["a", "b"], notes = "..."}