conductor(track): Initialize test_batching_refactor_20260606 spec
Three-tier batching refactor: replace alphabetical 4-at-a-time batching with fixture-class-isolated tiers (0 opt-in, 1 unit/xdist, 2 mock_app, 3 live_gui in one session, H headless, P performance). Hybrid classification: auto-infer from filename + AST fixture scan; hand-curated tests/test_categories.toml overrides for cross-cutting and ambiguous files. Opt-in per-test order control via [[files.X.test_order]] sub-tables, gated on a conftest-loaded pytest plugin (no-op without entries). Priority order: B (process isolation) > A (subsystem diagnostic) > C (speed).
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
{
|
||||
"track_id": "test_batching_refactor_20260606",
|
||||
"name": "Test Batching Refactor",
|
||||
"initialized": "2026-06-06",
|
||||
"owner": "tier2-tech-lead",
|
||||
"priority": "medium",
|
||||
"status": "active",
|
||||
"type": "developer tooling + diagnostic improvement",
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"scripts/test_categorizer.py",
|
||||
"scripts/test_batcher.py",
|
||||
"scripts/pytest_collection_order.py",
|
||||
"tests/test_categories.toml",
|
||||
"tests/test_categorizer.py",
|
||||
"tests/test_batcher.py"
|
||||
],
|
||||
"modified_files": [
|
||||
"scripts/run_tests_batched.py",
|
||||
"tests/conftest.py",
|
||||
"pyproject.toml"
|
||||
],
|
||||
"deleted_files_at_phase4": [
|
||||
"scripts/run_tests_batched.py.legacy"
|
||||
]
|
||||
},
|
||||
"blocked_by": [],
|
||||
"blocks": [],
|
||||
"estimated_phases": 4,
|
||||
"spec": "spec.md",
|
||||
"plan": "plan.md",
|
||||
"priority_order": "B (process isolation by fixture class) > A (subsystem diagnostic grouping) > C (xdist + live_gui session reuse)",
|
||||
"tier_model": {
|
||||
"0_opt_in": "test_clean_install.py, test_docker_build.py; one batch per file; runs only if env var set AND --include-opt-in passed",
|
||||
"1_unit": "Pure unit tests (no live_gui/mock_app/app_instance); grouped by batch_group; pytest-xdist -n auto",
|
||||
"2_mock_app": "Tests using mock_app or app_instance fixtures; grouped by batch_group; no xdist",
|
||||
"3_live_gui": "All tests using live_gui fixture in ONE pytest invocation (session-scoped reuse)",
|
||||
"H_headless": "Headless service tests; one pytest invocation",
|
||||
"P_performance": "Performance/stress tests; runs last; one pytest invocation"
|
||||
},
|
||||
"hybrid_classification": "Auto-infer by default from filename and AST fixture scan; tests/test_categories.toml provides hand-curated overrides for cross-cutting and ambiguous files. Registry always wins precedence.",
|
||||
"architectural_invariant": "Every pytest subprocess invocation has a single, well-defined fixture profile. live_gui tests never share a pytest process with non-live_gui tests. Opt-in tests are gated on BOTH env var AND --include-opt-in CLI flag (defense in depth).",
|
||||
"cli_surface": {
|
||||
"default": "All tiers except opt-in (0) and performance (P); xdist enabled for tier 1",
|
||||
"--tiers": "Comma-separated tier list to include (e.g. --tiers 1,2,3)",
|
||||
"--include-opt-in": "Hard flag required IN ADDITION to env var to run opt-in tests",
|
||||
"--plan": "Dry-run; print batch plan and exit",
|
||||
"--audit": "List auto-inferred (unclassified) files; exit non-zero on hard errors",
|
||||
"--no-xdist": "Disable pytest-xdist for tier 1 (debug aid)",
|
||||
"--strict-markers": "Pass --strict-markers to pytest (catch marker typos)"
|
||||
},
|
||||
"verification_criteria": [
|
||||
"scripts/test_categorizer.py::categorize_all returns 277+ CategoryRecords with no exceptions",
|
||||
"scripts/test_batcher.py::plan is deterministic (same inputs -> same outputs)",
|
||||
"All 277+ test files are correctly classified: live_gui / mock_app / unit / opt_in / performance",
|
||||
"Cross-cutting files (test_gui_dag_beads, test_arch_boundary_phase*, etc.) are flagged with multiple subsystems in the report",
|
||||
"--plan output matches the existing 4-at-a-time batching modulo opt-in gating",
|
||||
"No live_gui test ever runs in the same pytest invocation as a non-live_gui test",
|
||||
"Opt-in tests are skipped silently when env var is not set (no warning, no error)",
|
||||
"Opt-in tests are skipped silently when --include-opt-in is not passed (env var alone is insufficient)",
|
||||
"scripts/check_test_toml_paths.py still exits 0 (no real TOML references in tests)",
|
||||
"Existing 273+ test suite passes when run via the new script in --tiers 1,2,3 mode",
|
||||
"tests/test_categorizer.py and tests/test_batcher.py pass with >80% coverage",
|
||||
"pytest_collection_order plugin is a no-op when no [[test_order]] entries exist (zero overhead)"
|
||||
],
|
||||
"links": {
|
||||
"backlog_entry": "conductor/tracks.md (to be added at top of Remaining Backlog)",
|
||||
"current_script": "scripts/run_tests_batched.py",
|
||||
"testing_guide": "docs/guide_testing.md",
|
||||
"workflow_pitfalls": "conductor/workflow.md#known-pitfalls-2026-06-05",
|
||||
"related_tracks": [
|
||||
"conductor/tracks/startup_speedup_20260606/",
|
||||
"conductor/tracks/regression_fixes_20260605/",
|
||||
"conductor/tracks/live_gui_test_hardening_v2_20260605/"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,348 @@
|
||||
# Track: Test Batching Refactor
|
||||
|
||||
**Status:** Active (spec approved 2026-06-06)
|
||||
**Initialized:** 2026-06-06
|
||||
**Owner:** Tier 2 Tech Lead
|
||||
**Priority:** Medium (developer ergonomics + diagnostic improvement; not a regression blocker)
|
||||
|
||||
---
|
||||
|
||||
## 1. Problem Statement
|
||||
|
||||
The current test batching script (`scripts/run_tests_batched.py`, 36 lines) groups test files alphabetically in chunks of 4 with `pytest --maxfail=10`. This produces three concrete failure modes:
|
||||
|
||||
1. **Zero diagnostic signal on failure.** When batch 17 fails, the user sees four unrelated filenames and a traceback. There is no way to know which subsystem broke without re-running individual files.
|
||||
2. **No awareness of `live_gui` session-scoped fixture.** The `conductor/workflow.md` Known Pitfalls (2026-06-05) explicitly document that `live_gui` is session-scoped and that tests assuming a clean ImGui state are fragile. The current script *accidentally* avoids cross-batch pollution (each batch is a fresh `subprocess.run`) but is one refactor away from breaking that.
|
||||
3. **No awareness of opt-in tests.** `test_clean_install.py` and `test_docker_build.py` are gated on environment variables but have no marker-based enforcement; running the script on a fresh clone can spuriously invoke them.
|
||||
|
||||
The script's 4-at-a-time batching also has the property that fast unit tests and slow live_gui tests can be mixed in the same pytest invocation if the order changes — the alphabetical sort happens to interleave them.
|
||||
|
||||
## 2. Goals (Priority Order)
|
||||
|
||||
| Priority | Goal | Rationale |
|
||||
|---|---|---|
|
||||
| **B (foundational)** | Process isolation by fixture class. live_gui never shares a pytest process with non-live_gui tests. | `live_gui` is session-scoped; mixing in the same `pytest` invocation causes state pollution. workflow.md 2026-06-05 gotchas are explicit. |
|
||||
| **B (foundational)** | Opt-in tests gated on env var AND the `--include-opt-in` flag, skipped silently otherwise. | `test_clean_install.py` clones the repo; `test_docker_build.py` builds an image. Running these by default is wrong. |
|
||||
| **A (primary value)** | Diagnostic precision via subsystem grouping. When a batch fails, the report names the subsystem. | The user's stated complaint: "naive alphabetical groupings" provide no signal. |
|
||||
| **A (primary value)** | Warn on unclassified files (registry miss), do not fail the run. | New tests should be flagged for human review without blocking the suite. |
|
||||
| **C (optimization)** | Tier-1 (unit) parallelism via `pytest-xdist`. | Pure unit tests are independent; xdist is a free 2-4x speedup there. |
|
||||
| **C (optimization)** | Live-gui session reuse (all `live_gui` test files in one pytest invocation). | Each fresh `sloppy.py` startup costs ~15s. Reusing the session is the only way to keep live_gui runtime sane. |
|
||||
| **Nice-to-have** | Opt-in per-test order control via the registry. | When test B is known to depend on test A's side effect, ordering matters. Optional; zero impact when unused. |
|
||||
|
||||
### 2.1 Non-Goals
|
||||
|
||||
- **Not** changing the underlying test framework (pytest stays).
|
||||
- **Not** restructuring test files into subdirectories (the flat `tests/` layout is preserved).
|
||||
- **Not** introducing new pytest markers on the test functions themselves. The categorization lives in a single registry file, not on the test code.
|
||||
- **Not** making the script required for CI today. The existing `uv run pytest tests/ -v` invocation keeps working; this script is a developer ergonomics + diagnostic tool.
|
||||
|
||||
## 3. Architecture
|
||||
|
||||
### 3.1 Three-Tier Model (Fixture Class as Primary Axis)
|
||||
|
||||
```
|
||||
tests/
|
||||
conftest.py # pytest plugin entry: registers collection_order plugin
|
||||
test_categories.toml # hand-curated overrides + classification
|
||||
artifacts/ # git-ignored; test outputs (unchanged)
|
||||
logs/ # git-ignored; live_gui logs (unchanged)
|
||||
*.py # test files (unchanged)
|
||||
|
||||
scripts/
|
||||
run_tests_batched.py # REPLACED: now the orchestrator
|
||||
pytest_collection_order.py # NEW: conftest-loaded plugin for opt-in order control
|
||||
test_categorizer.py # NEW: classifier library (auto-infer + registry)
|
||||
test_batcher.py # NEW: scheduler library (turn categories into batches)
|
||||
```
|
||||
|
||||
The categorizer is a pure function: `categorize(filename) -> CategoryRecord`. The batcher is a pure function: `plan(categories, options) -> list[Batch]`. The script is the CLI shell that wires the two together and shells out to `pytest`.
|
||||
|
||||
### 3.2 Data Model
|
||||
|
||||
```python
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
class FixtureClass(str, Enum):
|
||||
UNIT = "unit"
|
||||
MOCK_APP = "mock_app"
|
||||
LIVE_GUI = "live_gui"
|
||||
HEADLESS = "headless"
|
||||
OPT_IN = "opt_in"
|
||||
PERFORMANCE = "performance"
|
||||
|
||||
class Speed(str, Enum):
|
||||
FAST = "fast" # <1s typical
|
||||
MEDIUM = "medium" # 1-5s
|
||||
SLOW = "slow" # 5-30s
|
||||
VERY_SLOW = "very_slow" # >30s
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CategoryRecord:
|
||||
filename: str
|
||||
fixture_class: FixtureClass
|
||||
subsystems: list[str] # 1..N; multi-subsystem for cross-cutting
|
||||
speed: Speed
|
||||
batch_group: str # groups files within a tier for sub-batching
|
||||
notes: str = ""
|
||||
# Per-test order (opt-in). Default empty dict means natural pytest order.
|
||||
test_order: dict[str, int] = field(default_factory=dict)
|
||||
# Provenance: where did the classification come from?
|
||||
source: str = "auto" # "auto" | "registry"
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
```
|
||||
|
||||
### 3.3 The Six Tiers (Batches = pytest Subprocess Invocations)
|
||||
|
||||
| Tier | FixtureClass | Batch strategy | xdist | Max-fail |
|
||||
|---|---|---|---|---|
|
||||
| **0** | `OPT_IN` | One pytest invocation per file; runs only if the env var is set AND `--include-opt-in` is passed. Skipped silently otherwise. | no | 1 |
|
||||
| **1** | `UNIT` | Grouped by `batch_group` into ~5–8 pytest invocations. | `-n auto` | 10 |
|
||||
| **2** | `MOCK_APP` | Grouped by `batch_group` into ~3–5 pytest invocations. | no (single App instance) | 5 |
|
||||
| **3** | `LIVE_GUI` | **One pytest invocation for all live_gui files.** Session-scoped reuse. Sub-report groups by subsystem via `--co`-derived reporting (post-hoc, from collected test IDs). | no | 1 (session crash = nuke) |
|
||||
| **H** | `HEADLESS` | One pytest invocation; all headless service tests together. | no | 5 |
|
||||
| **P** | `PERFORMANCE` | One pytest invocation; runs last so failures don't block the main feedback loop. | no | 1 |
|
||||
|
||||
The ordering is: **0 → 1 → 2 → 3 → H → P** (opt-in first, perf last).
|
||||
|
||||
### 3.4 The Registry: `tests/test_categories.toml`
|
||||
|
||||
```toml
|
||||
# Schema for each [files.<name>] entry:
|
||||
# fixture_class = "unit" | "mock_app" | "live_gui" | "headless" | "opt_in" | "performance"
|
||||
# subsystems = list of strings (subsystem tags; cross-cutting tests list 2+)
|
||||
# speed = "fast" | "medium" | "slow" | "very_slow"
|
||||
# batch_group = string (sub-batching key within a tier)
|
||||
# notes = free text (optional)
|
||||
#
|
||||
# Opt-in per-test order:
|
||||
# [[files.<name>.test_order]]
|
||||
# test_id = "test_foo::test_bar" # pytest node ID
|
||||
# order = 10 # lower runs first; tests without entries sort after entries
|
||||
|
||||
# Cross-cutting GUI+DAG+Beads test (would be auto-classified as "gui" but actually
|
||||
# touches 3 subsystems; registry overrides subsystems to be explicit)
|
||||
[files.test_gui_dag_beads]
|
||||
fixture_class = "live_gui"
|
||||
subsystems = ["gui", "dag", "beads"]
|
||||
speed = "slow"
|
||||
batch_group = "gui"
|
||||
notes = "Cross-cutting: drives GUI, asserts on DAG state, exercises Beads backend"
|
||||
|
||||
# Architectural boundary test (auto-classification would be ambiguous)
|
||||
[files.test_arch_boundary_phase1]
|
||||
fixture_class = "unit"
|
||||
subsystems = ["architecture"]
|
||||
speed = "fast"
|
||||
batch_group = "core"
|
||||
notes = "Phase 1 of the arch-boundary refactor; no fixture dependencies"
|
||||
|
||||
# Opt-in per-test order example
|
||||
[[files.test_mma_ticket_actions.test_order]]
|
||||
test_id = "test_mma_ticket_actions::test_blocked_ticket_does_not_execute"
|
||||
order = 5
|
||||
|
||||
[[files.test_mma_ticket_actions.test_order]]
|
||||
test_id = "test_mma_ticket_actions::test_priority_ordering"
|
||||
order = 10
|
||||
```
|
||||
|
||||
**Precedence:** registry entries always win. An auto-inferred `fixture_class = "unit"` is replaced by `fixture_class = "mock_app"` if the registry says so. This makes the registry the single source of truth for everything it touches, and the auto-inference is a sensible default for everything else.
|
||||
|
||||
### 3.5 Auto-Inference Rules
|
||||
|
||||
Implemented in `scripts/test_categorizer.py::auto_classify()`. Evaluated in order; first match wins:
|
||||
|
||||
| # | Rule | Match condition | Result |
|
||||
|---|---|---|---|
|
||||
| 1 | Opt-in filename | `test_clean_install` or `test_docker_build` prefix | `OPT_IN` |
|
||||
| 2 | live_gui fixture | File source matches the regex `def test_\w+\([^)]*\blive_gui\b` (a test function that takes `live_gui` anywhere in its parameter list, not only as the sole argument) | `LIVE_GUI` |
|
||||
| 3 | Mock app fixture | File references `mock_app` or `app_instance` (fixture name) | `MOCK_APP` |
|
||||
| 4 | Headless service | File references headless-service fixtures (e.g. `headless_client`, `TestClient(app)`) | `HEADLESS` |
|
||||
| 5 | Performance keyword | Filename matches `*perf*`, `*stress*`, `*phase_3_final*`, `*phase_4_stress*` | `PERFORMANCE` |
|
||||
| 6 | Default | None of the above | `UNIT` |
|
||||
|
||||
**Subsystem auto-inference:** Take the longest known subsystem prefix from a curated list. Known prefixes (alphabetical for stable ordering): `ai`, `api`, `arch`, `ast`, `async`, `auto`, `beads`, `bias`, `cache`, `cli`, `cmd`, `comms`, `conductor`, `context`, `cost`, `dag`, `deepseek`, `diff`, `discussion`, `event`, `execution`, `external`, `ext`, `fuzzy`, `gemini`, `gui`, `headless`, `history`, `hooks`, `hot`, `imgui`, `layout`, `live`, `log`, `mcp`, `markdown`, `minimax`, `mma`, `model`, `orchestrator`, `outline`, `parallel`, `patch`, `perf`, `persona`, `phase`, `pipeline`, `preset`, `prior`, `process`, `project`, `provider`, `rag`, `script`, `session`, `shader`, `sim`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `theme`, `thinking`, `ticket`, `tier4`, `tiered`, `token`, `tool`, `track`, `tree`, `ts`, `undo`, `usage`, `user`, `vendor`, `view`, `visual`, `vlogger`, `websocket`, `workflow`, `workspace`, `z`.
|
||||
|
||||
**Speed auto-inference:** Read `.test_durations.json` if present (key = `<filename>::<test_id>`, value = seconds). Aggregate by file (p95). Map: `<1s` → FAST, `<5s` → MEDIUM, `<30s` → SLOW, else VERY_SLOW. If no history file, default to MEDIUM.
|
||||
|
||||
**Batch-group auto-inference:** Cluster subsystems into groups heuristically:
|
||||
- `core` = `mcp`, `ai`, `context`, `api`, `dag`, `path`, `preset`, `persona`, `history`, `workspace`, `rag`, `beads`, `model`, `ast`, `async`, `cache`, `cli`, `cmd`, `fuzzy`, `hooks`, `log`, `markdown`, `orchestrator`, `outline`, `pipeline`, `project`, `provider`, `script`, `session`, `skeleton`, `slice`, `spawn`, `status`, `subagent`, `summary`, `symbol`, `sync`, `synthesis`, `system`, `takes`, `thinking`, `tier4`, `tiered`, `tool`, `track`, `tree`, `ts`, `usage`, `vendor`, `vlogger`, `websocket`, `workflow`
|
||||
- `gui` = `gui`, `theme`, `imgui`, `layout`, `live`, `prior`, `visual`, `view`, `undo`
|
||||
- `mma` = `mma`, `conductor`, `execution`, `ext`, `external`, `auto`, `manual`, `tier`, `arch`, `phase`, `process`, `z`
|
||||
- `comms` = `comms`, `diff`, `patch`, `event`, `hot`, `process`, `shader`
|
||||
- `headless` = `headless`
|
||||
|
||||
Single-subsystem tests use that subsystem's group. Multi-subsystem tests default to the group of the FIRST subsystem in their list (registry override can correct).
|
||||
|
||||
## 4. Components
|
||||
|
||||
### 4.1 `scripts/test_categorizer.py` — Pure classifier
|
||||
|
||||
```python
|
||||
def auto_classify(path: Path, durations: dict[str, float] | None = None) -> CategoryRecord: ...
|
||||
def load_registry(toml_path: Path) -> dict[str, dict]: ...
|
||||
def merge_registry(auto: CategoryRecord, registry: dict) -> CategoryRecord: ...
|
||||
def categorize_all(tests_dir: Path, registry_path: Path) -> list[CategoryRecord]: ...
|
||||
```
|
||||
|
||||
Public API. No I/O at import time. Reads registry lazily. The `categorize_all` function returns one `CategoryRecord` per test file in `tests/`. Each record's `source` field is `"registry"` if the registry had any matching entry, else `"auto"`. Each record's `warnings` field is populated with any inconsistencies detected (e.g., auto-inferred fixture_class differs from registry).
|
||||
|
||||
### 4.2 `scripts/test_batcher.py` — Pure scheduler
|
||||
|
||||
```python
|
||||
@dataclass(frozen=True)
|
||||
class Batch:
|
||||
tier: str # "0", "1", "2", "3", "H", "P"
|
||||
label: str # "tier-1-unit-core"
|
||||
files: list[Path]
|
||||
pytest_args: list[str] # e.g. ["-n", "auto", "--maxfail=10"]
|
||||
estimated_seconds: float
|
||||
skip_reason: str | None = None # populated for skipped opt-in batches
|
||||
|
||||
def plan(
|
||||
records: list[CategoryRecord],
|
||||
*,
|
||||
tiers: set[str] = {"0", "1", "2", "3", "H", "P"},
|
||||
include_opt_in: bool = False,
|
||||
xdist: bool = True,
|
||||
) -> list[Batch]: ...
|
||||
```
|
||||
|
||||
The `plan` function is deterministic. The same `records` + same `options` produce the same `list[Batch]`. This makes the planner trivially testable and makes the `--plan` dry-run mode a one-liner.
|
||||
|
||||
### 4.3 `scripts/run_tests_batched.py` — CLI orchestrator
|
||||
|
||||
Responsibilities (slim, delegates everything else):
|
||||
1. Parse CLI args (`--tiers`, `--include-opt-in`, `--plan`, `--audit`, `--no-xdist`, `--strict-markers`, `--tests-dir`, `--registry`).
|
||||
2. Call `categorize_all(tests_dir, registry_path)`.
|
||||
3. If `--audit`: print records where `source == "auto"`, exit non-zero if any have empty subsystem lists or other hard errors. Exit 0 if every record is well-formed even if some are auto-inferred.
|
||||
4. If `--plan`: print the batch list (one row per batch with label, files, estimated seconds) and exit.
|
||||
5. Otherwise: call `plan()`, iterate batches, run each as `subprocess.run(uv + pytest + pytest_args + files)`, accumulate per-batch results, print the summary table.
|
||||
6. Return the worst per-batch exit code (0 only if all batches pass).
|
||||
|
||||
The script is intentionally <150 lines. All logic lives in the two library modules.
|
||||
|
||||
### 4.4 `scripts/pytest_collection_order.py` — Conftest-loaded plugin
|
||||
|
||||
Hook: `pytest_collection_modifyitems(config, items)`. Reads `tests/test_categories.toml` once at session start, builds a `dict[str, int]` from `[[files.<name>.test_order]]` entries, then sorts items within each file by their order index. Items without an order index sort after items with one (preserves pytest's natural order for unannotated tests).
|
||||
|
||||
Registered via `tests/conftest.py`:
|
||||
|
||||
```python
|
||||
pytest_plugins = ["scripts.pytest_collection_order"]
|
||||
```
|
||||
|
||||
This is opt-in by design: if no `test_categories.toml` exists OR no `[[files.X.test_order]]` entries exist, the plugin is a no-op (zero items sorted, zero overhead).
|
||||
|
||||
## 5. Output / Report Format
|
||||
|
||||
After the run, the script prints a summary table:
|
||||
|
||||
```
|
||||
[TIER 0] opt-in (clean_install) SKIPPED RUN_CLEAN_INSTALL_TEST not set
|
||||
[TIER 0] opt-in (docker) SKIPPED RUN_DOCKER_TEST not set
|
||||
[TIER 1] unit: core PASS 42/42 8.3s
|
||||
[TIER 1] unit: gui PASS 17/17 2.1s
|
||||
[TIER 1] unit: mma FAIL 12/13 1.8s ← test_mma_ticket_actions::test_x
|
||||
[TIER 2] mock_app: core PASS 31/31 6.4s
|
||||
[TIER 3] live_gui PASS 14/14 47.2s
|
||||
[TIER H] headless PASS 3/3 4.0s
|
||||
[TIER P] performance SKIPPED --tiers excludes P
|
||||
[TOTAL] 4 tiers run, 120 tests, 69.8s, 1 failed
|
||||
```
|
||||
|
||||
For Tier 3, the per-test failures are still in the regular pytest output (one pytest invocation); the summary line just reports the tier-level pass/fail.
|
||||
|
||||
## 6. CLI Surface
|
||||
|
||||
```powershell
|
||||
# Default: all tiers except opt-in and performance; xdist on for tier 1
|
||||
python scripts/run_tests_batched.py
|
||||
|
||||
# Skip slow/expensive stuff
|
||||
python scripts/run_tests_batched.py --tiers 1,2
|
||||
|
||||
# Include opt-in tests (also requires the env var; the flag is a hard requirement
|
||||
# so a CI run cannot accidentally enable them by exporting the env var)
|
||||
python scripts/run_tests_batched.py --include-opt-in
|
||||
|
||||
# Dry-run: show the batch plan, don't run anything
|
||||
python scripts/run_tests_batched.py --plan
|
||||
|
||||
# Audit: list unclassified (auto-inferred) files as warnings; exit non-zero only on hard errors
|
||||
python scripts/run_tests_batched.py --audit
|
||||
|
||||
# Disable xdist (e.g., when debugging a test that flakes under parallelism)
|
||||
python scripts/run_tests_batched.py --no-xdist
|
||||
|
||||
# Override the tests directory or registry path
|
||||
python scripts/run_tests_batched.py --tests-dir tests --registry tests/test_categories.toml
|
||||
```
|
||||
|
||||
The `--include-opt-in` flag is **additive** to env var gating, not a replacement. A user must both set the env var AND pass the flag. This prevents accidental opt-in execution when an env var is set globally.
|
||||
|
||||
## 7. Configuration
|
||||
|
||||
### 7.1 `pyproject.toml` addition
|
||||
|
||||
```toml
|
||||
[tool.pytest.ini_options]
|
||||
addopts = ["-ra"] # --strict-markers is NOT added here; pass it via the script's --strict-markers flag
|
||||
markers = [
|
||||
"integration: marks tests as integration tests (requires live GUI)",
|
||||
"clean_install: clean install verification (opt-in via RUN_CLEAN_INSTALL_TEST=1)",
|
||||
"docker: docker build and run test (opt-in via RUN_DOCKER_TEST=1)",
|
||||
]
|
||||
```
|
||||
|
||||
`--strict-markers` is opt-in via the script's `--strict-markers` flag, not added to `addopts` globally, to avoid breaking existing test runs that haven't been audited.
|
||||
|
||||
### 7.2 `.test_durations.json` (auto-generated, git-ignored)
|
||||
|
||||
Written by `run_tests_batched.py` after a successful run. Format:
|
||||
|
||||
```json
|
||||
{
|
||||
"tests/test_foo.py::test_bar": 0.043,
|
||||
"tests/test_foo.py::test_baz": 1.234
|
||||
}
|
||||
```
|
||||
|
||||
Used by the categorizer for `speed` auto-inference. If absent, all files default to MEDIUM speed (no batch reordering). Add `tests/.test_durations.json` to `.gitignore` (or place under `tests/artifacts/`).
|
||||
|
||||
## 8. Migration / Rollout
|
||||
|
||||
| Phase | What | Risk |
|
||||
|---|---|---|
|
||||
| **Phase 1 — Library + dry-run** | Add `test_categorizer.py`, `test_batcher.py`, `pytest_collection_order.py`. Add `--plan` and `--audit` modes to a NEW script (don't replace the old one yet). Run on a clean clone; manually verify the plan covers exactly the same set of test files as the existing 4-at-a-time script (modulo opt-in gating). | None. Old script untouched. |
|
||||
| **Phase 2 — Shadow run** | Run the new script in CI as a non-blocking job (informational only). Compare its pass/fail signature to the old script's. Investigate any divergence. | Low. Old script still authoritative. |
|
||||
| **Phase 3 — Switch default** | Replace the old `run_tests_batched.py` with the new one. Update `docs/guide_testing.md` to point at the new section. Keep the old script under `scripts/run_tests_batched.py.legacy` for one cycle. | Medium. Mitigation: Phase 2 shadow run. |
|
||||
| **Phase 4 — Cleanup** | Delete the legacy script. Add the registry file (`tests/test_categories.toml`) populated with the ~30 cross-cutting / ambiguous files identified during audit. Mark the remaining files as auto-inferred in the report. | Low. |
|
||||
|
||||
Each phase has its own implementation plan produced by the writing-plans skill.
|
||||
|
||||
## 9. Risks & Mitigations
|
||||
|
||||
| Risk | Likelihood | Impact | Mitigation |
|
||||
|---|---|---|---|
|
||||
| Auto-inference misclassifies a cross-cutting test, putting it in the wrong tier. | Medium | Medium (wrong fixture class could cause pollution) | `--audit` mode lists all auto-inferred records; CI gate on `--audit --strict` exits non-zero if any auto-classified file has multiple subsystems (a heuristic for "probably cross-cutting"). Registry overrides are one-line fixes. |
|
||||
| Tier 3 (live_gui) shares one pytest process; one crash kills all live_gui tests for the run. | Low (existing behavior) | High (15s+ wasted + missing signal) | `--maxfail=1` for tier 3. Document the trade-off: faster average runtime, but a crash in one test forfeits the rest. |
|
||||
| `pytest-xdist` introduces non-determinism in unit tests that share state via module globals. | Low | Medium | Audit scripts flag any unit test that mutates a module-level `src.*` global. Tests that do must be moved to Tier 2 (mock_app) or registered as `MOCK_APP` explicitly. |
|
||||
| Speed auto-inference from `.test_durations.json` is stale. | Medium | Low (wrong `speed` field, not wrong tier) | `speed` affects only the summary table; tiers are determined by `fixture_class`. Stale speed data does not affect process isolation. |
|
||||
| New tests added without a registry entry slip through unclassified. | Medium | Low | `--audit` mode warns; CI can gate on `--audit --strict` (planned for Phase 3). |
|
||||
| `pytest_collection_order` plugin sorts items but tests have hard dependencies on collection order (e.g., shared module state). | Low | High | The plugin is opt-in per file. No `[[test_order]]` entries = natural pytest order. Document the contract in the plugin docstring. |
|
||||
|
||||
## 10. Open Questions
|
||||
|
||||
1. Should the registry live in `tests/` or at the repo root? (Proposal: `tests/test_categories.toml` so it lives next to the tests it describes.)
|
||||
2. Should `batch_group` be inferred by default or required to be explicit? (Proposal: inferred by default; explicit in registry.)
|
||||
3. Should we expose a `python scripts/run_tests_batched.py --tier 3 --file test_gui_dag_beads` mode for ad-hoc single-file runs? (Proposal: yes, defer to a follow-up plan.)
|
||||
4. Should the speed auto-inference be updated incrementally (per run) or only on explicit `--record-durations` opt-in? (Proposal: per-run by default; the file is git-ignored so it's just a developer-local cache.)
|
||||
|
||||
## 11. See Also
|
||||
|
||||
- `docs/guide_testing.md` — current testing guide (will be updated in Phase 3 to reference the new script)
|
||||
- `conductor/workflow.md` "Known Pitfalls (2026-06-05)" — `live_gui` session-scoped fixture gotchas
|
||||
- `conductor/tracks/startup_speedup_20260606/` — example of a prior active track in this project (same convention)
|
||||
@@ -0,0 +1,97 @@
|
||||
# Track state for test_batching_refactor_20260606
|
||||
# Updated by Tier 2 Tech Lead as tasks complete
|
||||
|
||||
[meta]
|
||||
track_id = "test_batching_refactor_20260606"
|
||||
name = "Test Batching Refactor"
|
||||
status = "active"
|
||||
current_phase = 0
|
||||
last_updated = "2026-06-06"
|
||||
|
||||
[phases]
|
||||
# Phase 1: Library + dry-run (categorizer + batcher + plugin, --plan/--audit modes)
|
||||
phase_1 = { status = "pending", checkpoint_sha = "", name = "Library + dry-run modes" }
|
||||
# Phase 2: Shadow run (compare new vs old in CI, no behavior change)
|
||||
phase_2 = { status = "pending", checkpoint_sha = "", name = "Shadow run + divergence check" }
|
||||
# Phase 3: Switch default (replace old script, update guide_testing.md)
|
||||
phase_3 = { status = "pending", checkpoint_sha = "", name = "Switch default + docs update" }
|
||||
# Phase 4: Cleanup (populate registry, delete legacy, archive track)
|
||||
phase_4 = { status = "pending", checkpoint_sha = "", name = "Registry population + legacy removal" }
|
||||
|
||||
[tasks]
|
||||
# Phase 1: Library + dry-run
|
||||
# (Tasks TBD by writing-plans skill; placeholder structure only)
|
||||
t1_1 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_opt_in_filename" }
|
||||
t1_2 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_live_gui_fixture_scan" }
|
||||
t1_3 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_mock_app_fixture_scan" }
|
||||
t1_4 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_perf_keyword" }
|
||||
t1_5 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_auto_classify_default_unit" }
|
||||
t1_6 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_subsystem_inference_known_prefixes" }
|
||||
t1_7 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_speed_inference_from_durations" }
|
||||
t1_8 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_batch_group_inference" }
|
||||
t1_9 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_merge_registry_overrides_auto" }
|
||||
t1_10 = { status = "pending", commit_sha = "", description = "Red: tests/test_categorizer.py::test_categorize_all_277_files" }
|
||||
t1_11 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_categorizer.py" }
|
||||
t1_12 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_unit_tier_groups_by_batch_group" }
|
||||
t1_13 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_live_gui_tier_one_invocation" }
|
||||
t1_14 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_opt_in_skipped_without_flag" }
|
||||
t1_15 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_deterministic" }
|
||||
t1_16 = { status = "pending", commit_sha = "", description = "Red: tests/test_batcher.py::test_plan_xdist_only_for_tier_1" }
|
||||
t1_17 = { status = "pending", commit_sha = "", description = "Green: implement scripts/test_batcher.py" }
|
||||
t1_18 = { status = "pending", commit_sha = "", description = "Red: tests/test_pytest_collection_order.py::test_no_op_without_entries" }
|
||||
t1_19 = { status = "pending", commit_sha = "", description = "Red: tests/test_pytest_collection_order.py::test_sorts_by_order_index" }
|
||||
t1_20 = { status = "pending", commit_sha = "", description = "Green: implement scripts/pytest_collection_order.py" }
|
||||
t1_21 = { status = "pending", commit_sha = "", description = "Wire pytest plugin in tests/conftest.py (pytest_plugins list)" }
|
||||
t1_22 = { status = "pending", commit_sha = "", description = "Implement scripts/run_tests_batched.py with --plan and --audit modes only" }
|
||||
t1_23 = { status = "pending", commit_sha = "", description = "Manually verify --plan output: all 277 files appear, tiers correctly assigned" }
|
||||
t1_24 = { status = "pending", commit_sha = "", description = "Phase 1 checkpoint commit + git note" }
|
||||
# Phase 2: Shadow run
|
||||
t2_1 = { status = "pending", commit_sha = "", description = "Add CI workflow job: run new script in --tiers 1,2 mode; compare exit code to old script" }
|
||||
t2_2 = { status = "pending", commit_sha = "", description = "Investigate any divergence; fix categorizer/batcher" }
|
||||
t2_3 = { status = "pending", commit_sha = "", description = "Phase 2 checkpoint commit + git note" }
|
||||
# Phase 3: Switch default
|
||||
t3_1 = { status = "pending", commit_sha = "", description = "Add --include-opt-in and --tiers CLI handling to scripts/run_tests_batched.py" }
|
||||
t3_2 = { status = "pending", commit_sha = "", description = "Add --durations record-on-success to scripts/run_tests_batched.py" }
|
||||
t3_3 = { status = "pending", commit_sha = "", description = "Update docs/guide_testing.md 'Running Tests' section to reference new script" }
|
||||
t3_4 = { status = "pending", commit_sha = "", description = "Rename old scripts/run_tests_batched.py to scripts/run_tests_batched.py.legacy" }
|
||||
t3_5 = { status = "pending", commit_sha = "", description = "Phase 3 checkpoint commit + git note" }
|
||||
# Phase 4: Cleanup
|
||||
t4_1 = { status = "pending", commit_sha = "", description = "Run --audit on a clean clone; collect auto-inferred files" }
|
||||
t4_2 = { status = "pending", commit_sha = "", description = "Populate tests/test_categories.toml with ~30 cross-cutting / ambiguous entries" }
|
||||
t4_3 = { status = "pending", commit_sha = "", description = "Add tests/.test_durations.json to .gitignore" }
|
||||
t4_4 = { status = "pending", commit_sha = "", description = "Delete scripts/run_tests_batched.py.legacy" }
|
||||
t4_5 = { status = "pending", commit_sha = "", description = "Archive track: git mv conductor/tracks/test_batching_refactor_20260606/ conductor/tracks/archive/" }
|
||||
t4_6 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md; move entry from Backlog to Recently Completed" }
|
||||
t4_7 = { status = "pending", commit_sha = "", description = "Phase 4 checkpoint commit + git note" }
|
||||
|
||||
[verification]
|
||||
# Filled at Phase 4
|
||||
auto_classify_opt_in = false
|
||||
auto_classify_live_gui = false
|
||||
auto_classify_mock_app = false
|
||||
auto_classify_perf = false
|
||||
auto_classify_default_unit = false
|
||||
subsystem_inference_known_prefixes = false
|
||||
speed_inference_from_durations = false
|
||||
batch_group_inference = false
|
||||
merge_registry_overrides_auto = false
|
||||
categorize_all_277_files = false
|
||||
plan_unit_tier_groups_by_batch_group = false
|
||||
plan_live_gui_tier_one_invocation = false
|
||||
plan_opt_in_skipped_without_flag = false
|
||||
plan_deterministic = false
|
||||
plan_xdist_only_for_tier_1 = false
|
||||
collection_order_no_op_without_entries = false
|
||||
collection_order_sorts_by_order_index = false
|
||||
plan_matches_4at_a_time = false
|
||||
audit_exits_nonzero_on_hard_errors = false
|
||||
opt_in_skipped_without_env_var = false
|
||||
opt_in_skipped_without_include_flag = false
|
||||
no_live_gui_in_same_invocation_as_others = false
|
||||
existing_test_suite_passes = false
|
||||
test_categorizer_coverage_pct = 0
|
||||
test_batcher_coverage_pct = 0
|
||||
|
||||
[registry_overrides]
|
||||
# Populated in Phase 4 T4.2; one entry per cross-cutting or ambiguous file
|
||||
# Format: {file = "test_X.py", fixture_class = "...", subsystems = ["a", "b"], notes = "..."}
|
||||
Reference in New Issue
Block a user