Merge branch 'tier2/result_migration_baseline_cleanup_20260620' into tier2/result_migration_cruft_removal_20260620

2026-06-21 08:59:14 -04:00
parent ce235795dd 92c83ee342
commit 92a4d8ea75
5 changed files with 2304 additions and 0 deletions
@@ -62,6 +62,7 @@ Tracks that are unblocked and ready to start. Ordered by **dependency** (blocked
 | ~~22~~ | — | ~~[Test Batching Post-Refactor Polish](#track-test-batching-post-refactor-polish)~~ | ~~SUPERSEDED by track 1 (FR1 + FR2)~~ | — |
 | 20 | — | [Prior Session Test Harden (20260605)](#track-prior-session-test-harden-20260605-superseded) | superseded; no action needed | — |
 | 21 | A | [Conductor Chronology (chronology.md canonical index)](#track-conductor-chronology) | spec ✓, plan ✓, 10/10 phases implemented; Phase 10 (user sign-off) pending; end-of-track report at `docs/reports/TRACK_COMPLETION_chronology_20260619.md` | (none — independent; **NEW 2026-06-19**; canonical-track infrastructure; the `superpowers_review_20260619` track is `blocked_by` this one) |
+| 22b | A (meta-tooling) | [Meta-Tooling Workflow Review — Past-Month LLM Behavior Analysis](#track-meta-tooling-workflow-review-past-month-llm-behavior-analysis) | spec ✓, plan ✓, metadata ✓, state ✓, **parked 2026-06-20** (current_phase=0); 11-phase plan; ≥4,000-LOC 4-part report; 13-15 atomic commits; Tier 1 anchor + 3 Tier 3 parallel sweeps | (none — independent; **NEW 2026-06-20**; sibling to nagent_review + fable_review + superpowers_review + intent_dsl_survey; produces workflow_improvements.md + implementation_sequencing.md as standalone inputs for a near-future "workflow improvements rebuild" track; research-only; no src/, tests/, AGENTS.md, conductor/*.md, .opencode/, or scripts/audit_*.py changes; **anti-sliming guard**: Phase 9 self-review + Phase 10 user review gate are literal hard gates per the chronology_20260619 handover) |

 **Note on numbering:** the legacy file used `0a`, `0b`, `0c`... and `0d`, `0e`, `0f`, `0g` for tracks created 2026-06-06+. This is the **git-blame sort order**, not a logical execution order. The new structure re-orders by dependency.

@@ -0,0 +1,143 @@
+{
+  "track_id": "meta_tooling_workflow_review_20260620",
+  "name": "Meta-Tooling Workflow Review — Past-Month LLM Behavior Analysis",
+  "type": "research-only",
+  "priority": "medium-high",
+  "owner": "Tier 1 Orchestrator (sole synthesis author); Tier 3 sub-agents for parallel sweeps",
+  "initialized": "2026-06-20",
+  "status": "active",
+  "current_phase": 0,
+  "blocked_by": [],
+  "blocks": [
+    {
+      "track_id": "workflow_improvements_rebuild_<future-date>",
+      "relationship": "this track produces standalone inputs (workflow_improvements.md + implementation_sequencing.md) for the rebuild track"
+    }
+  ],
+  "scope": {
+    "new_files": [
+      "conductor/tracks/meta_tooling_workflow_review_20260620/spec.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/metadata.json",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/state.toml",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/plan.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/report.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/comparison_table.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/decisions.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/shipped_work_index.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/llm_behavior_catalog.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/nagent_takeaways_meta_tooling_20260620.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/workflow_improvements.md",
+      "conductor/tracks/meta_tooling_workflow_review_20260620/implementation_sequencing.md"
+    ],
+    "modified_files": [
+      "conductor/tracks.md"
+    ],
+    "deleted_files": []
+  },
+  "sibling_reviews": [
+    "conductor/tracks/nagent_review_20260608/",
+    "conductor/tracks/fable_review_20260617/",
+    "conductor/tracks/superpowers_review_20260619/",
+    "conductor/tracks/intent_dsl_survey_20260612/"
+  ],
+  "user_directives": [
+    {"date": "2026-06-20", "directive": "Full past month (~75 reports + git log + state.toml + guide docs)", "source": "user (brainstorming Q1)"},
+    {"date": "2026-06-20", "directive": "Document-driven (4 parts): What shipped / LLM Behavior Patterns / Workflow Improvements / Implementation Sequencing", "source": "user (brainstorming Q2)"},
+    {"date": "2026-06-20", "directive": "Audit depth C: reports + git log + track spec deviations + state.toml + guide docs", "source": "user (brainstorming Q3)"},
+    {"date": "2026-06-20", "directive": "Recommendation structure D: by target doc × by confidence tier", "source": "user (brainstorming Q4)"},
+    {"date": "2026-06-20", "directive": "Execution model C: Tier 1 anchor + Tier 3 parallel sweeps; sub-agents for batch data only", "source": "user (brainstorming Q5)"},
+    {"date": "2026-06-20", "directive": "Output shape C: report + side artifacts + workflow_improvements.md + implementation_sequencing.md", "source": "user (brainstorming Q6)"},
+    {"date": "2026-06-20", "directive": "Minimum 4,000 line report; use nagent_review_v3.1 chunking strategy", "source": "user (brainstorming Q7)"},
+    {"date": "2026-06-20", "directive": "Be conservative with meta-tooling to not break OpenCode", "source": "user (overall framing)"},
+    {"date": "2026-06-20", "directive": "Park the track; do not execute in this session", "source": "user (execution handoff, Option 3)"}
+  ],
+  "execution_model": {
+    "tier_1_anchor": "Reads 10 spine reports; produces internal scratchpad for synthesis (not committed)",
+    "tier_3_parallel_sweeps": [
+      {"sweep": "A", "scope": "reports corpus (~75 files)", "output": "shipped_work_index.md (~300-500 LOC)"},
+      {"sweep": "B", "scope": "git log + git notes + state.toml user_directives + spec.md deviations", "output": "llm_behavior_catalog.md Part 1 (~500-700 LOC)"},
+      {"sweep": "C", "scope": "AGENTS.md + conductor/*.md + docs/guide_*.md + code_styleguides/*.md", "output": "llm_behavior_catalog.md Part 2 appended (~200-300 LOC)"}
+    ],
+    "tier_1_synthesis": "Reads sweep outputs + scratchpad; writes 4-part report.md (>=4,000 LOC) + side artifacts + standalone inputs"
+  },
+  "report_structure": {
+    "part_1_what_shipped": {
+      "target_loc": "800-1000",
+      "sub_sections": 5,
+      "sub_section_loc_range": "160-200",
+      "source": "shipped_work_index.md (Tier 3 sweep A)"
+    },
+    "part_2_llm_behavior_patterns": {
+      "target_loc": "1500-2000",
+      "target_pattern_count": 12,
+      "pattern_loc_range": "125-170",
+      "sub_section_count_per_pattern": 7,
+      "source": "llm_behavior_catalog.md (Tier 3 sweeps B+C)"
+    },
+    "part_3_workflow_improvements": {
+      "target_loc": "1000-1200",
+      "target_improvement_count": "15-25",
+      "improvement_loc_range": "50-80",
+      "sub_section_count_per_improvement": 6,
+      "organization": "5 target docs x 3 confidence tiers"
+    },
+    "part_4_implementation_sequencing": {
+      "target_loc": "300-500",
+      "phase_count": 5,
+      "phase_loc_range": "60-100",
+      "sub_section_count_per_phase": 5,
+      "principle": "conservative ordering: zero-risk doc edits first, audit scripts last"
+    },
+    "total_target_loc": ">=4000"
+  },
+  "verification_criteria": [
+    "report.md has all 4 parts present and non-empty",
+    "report.md total LOC >= 4,000 (per user directive 2026-06-20)",
+    "Part 1 has all 5 track-family sub-sections",
+    "Part 2 has 8-16 LLM behavior patterns (target 12) with the 7-sub-section structure + verdict block",
+    "Part 3 has 15-25 workflow improvements organized by 5 target docs x 3 confidence tiers",
+    "Part 4 has all 5 implementation phases with the 5-sub-section structure",
+    "comparison_table.md has ~50 rows",
+    "decisions.md has 15-25 entries sorted HIGH to LOW with destination files",
+    "shipped_work_index.md exists with per-track summaries",
+    "llm_behavior_catalog.md exists with the 12-pattern catalog",
+    "nagent_takeaways_meta_tooling_20260620.md exists with 5-part bridge structure",
+    "workflow_improvements.md exists as standalone (Part 3 verbatim)",
+    "implementation_sequencing.md exists as standalone (Part 4 verbatim + phase dependencies)",
+    "Every Part 2 pattern has a verdict block (NEW / PARTIALLY-CODIFIED / FULLY-CODIFIED / SUBSUMED)",
+    "Every Part 3 improvement has a destination file path",
+    "Every Part 4 phase has a rollback command",
+    "No src/ / tests/ / AGENTS.md / conductor/*.md / .opencode/agents/*.md / .opencode/commands/*.md / conductor/code_styleguides/*.md / scripts/audit_*.py changes (research-only)",
+    "Self-review pass complete (placeholder scan, internal consistency, scope check, ambiguity check, chunking verification)",
+    "User has reviewed and approved the final report + side artifacts + standalone inputs",
+    "conductor/tracks.md updated to register the track",
+    "All atomic commits have git notes attached per conductor/workflow.md §Task Workflow step 9.2",
+    "state.toml final state is current_phase=11 and status=active (until archived)",
+    "No new src/*.py or scripts/audit_*.py files created (per AGENTS.md hard rules)",
+    "No day / hour / minute estimates in any track artifact",
+    "The Tier 2 autonomous sandbox was NOT used for this track (Tier 1 inline execution per the user's framing)"
+  ],
+  "regressions_and_pre_existing_failures": [],
+  "pre_existing_failures_remaining": [],
+  "deferred_to_followup_tracks": [
+    {
+      "title": "Workflow Improvements Rebuild",
+      "description": "Apply the 5-phase conservative sequencing from Part 4 to AGENTS.md / conductor/workflow.md / conductor/code_styleguides/error_handling.md / .opencode/agents/*.md / scripts/audit_*.py. Consumes workflow_improvements.md + implementation_sequencing.md as standalone inputs.",
+      "track_status": "planned in meta_tooling_workflow_review_20260620",
+      "blocks_until": "meta_tooling_workflow_review_20260620 ships"
+    }
+  ],
+  "out_of_scope": [
+    "Modifying any agent-directive file in the project (the recommendations go to workflow_improvements.md for the deferred rebuild)",
+    "Building any recommendation (the deferred rebuild is its own track)",
+    "Reviewing every external AI corpus beyond the 4 sibling meta-analysis reviews",
+    "Doing a per-AGENTS.md-section review (the review identifies new patterns vs what's in AGENTS.md; it does not restructure AGENTS.md)",
+    "Rewriting or migrating docs/superpowers/specs/*.md -> conductor/tracks/<id>/spec.md (dual-convention problem is its own track)",
+    "Adding new .opencode/agents/*.md files, new conductor/code_styleguides/*.md files, or new scripts/audit_*.py scripts (the report may recommend these; the rebuild creates them)",
+    "Running automated tests (research-only; verification is the brainstorming-skill self-review plus user review)",
+    "Creating new docs/Readme.md or docs/AGENTS.md entries (the report is at conductor/tracks/meta_tooling_workflow_review_20260620/; not in the docs index)",
+    "The user's deferred workflow-improvements rebuild itself (the recommendations are inputs to that future track)",
+    "The chronology track's Phase 8 rewrite (the handover document is cited as evidence; the rewrite is its own track per the handover's recommendation)"
+  ],
+  "anti_sliming_notes": "Per the chronology_20260619 handover, the manual review gates must be respected literally. This track's Phase 9 self-review + Phase 10 user review gate are the explicit hard gates; the implementer (whichever tier picks it up) MUST NOT bulk-verify to bypass them."
+}
@@ -0,0 +1,465 @@
+# Track Specification: Meta-Tooling Workflow Review — Past-Month LLM Behavior Analysis
+
+**Status:** Spec approved 2026-06-20 (brainstorming dialogue complete; awaiting user review of written spec).
+**Initialized:** 2026-06-20
+**Owner:** Tier 1 Orchestrator (sole author of synthesis + spec; Tier 3 sub-agents dispatch for parallel batch sweeps of structured data per the user's directive)
+**Priority:** Medium-High (user-explicit; informs the near-future conservative AI-directive improvements track)
+**Type:** Research-only. No `src/` changes. No `tests/` changes. No `AGENTS.md` / `conductor/*.md` (other than registering the track in `conductor/tracks.md`) / `.opencode/agents/*.md` / `.opencode/commands/*.md` / `conductor/code_styleguides/*.md` / `scripts/audit_*.py` changes. Alongside `report.md`, the track produces 7 reference artifacts (5 side artifacts + 2 standalone inputs); the user's deferred workflow-improvement rebuild consumes the 2 standalone inputs directly.
+**Format:** Conductor convention (per the precedent set by `nagent_review_20260608`, `fable_review_20260617`, `superpowers_review_20260619`, `intent_dsl_survey_20260612`). All artifacts at `conductor/tracks/meta_tooling_workflow_review_20260620/`.
+
+---
+
+## 0. Overview
+
+This track produces a **systematic analysis of the past month's LLM agent behavior** (2026-05-20 → 2026-06-20) in the Manual Slop project, with the goal of identifying recurring failure modes, codifying what already works, and producing a **workflow improvements catalog** the user can use to introduce conservative OpenCode workflow / `conductor/` / agent-directive changes in a near-future track.
+
+The corpus spans:
+- ~75 reports in `docs/reports/` (the recent-discipline subset of the past ~2 weeks)
+- ~200-300 commit messages + ~80 git notes across the past month
+- ~40-50 `conductor/tracks/<id>/spec.md` deviation logs (the "deviations from spec/plan" sections)
+- ~30 `conductor/tracks/<id>/state.toml` `user_directives_logged` entries
+- The `AGENTS.md` "Critical Anti-Patterns" + "Session-Learned Anti-Patterns" + "Process Anti-Patterns" sections (the project's *compiled* LLM failure mode catalog)
+- Inline notes in `docs/guide_*.md` and `conductor/*.md`
+
+The deliverable is a 4-part `report.md` (≥4,000 LOC) that:
+1. **Part 1 — What Shipped** documents the past month's tracks and their outcomes
+2. **Part 2 — LLM Behavior Patterns** identifies the 12 most consequential agent failure modes (anti-sliming, hard-gate bypass, regression-after-refactor, etc.) with file:line citations
+3. **Part 3 — Workflow Improvements** catalogs conservative changes by target doc × confidence tier
+4. **Part 4 — Implementation Sequencing** orders the changes for the near-future rebuild track
+
+Plus 5 side artifacts (`comparison_table.md`, `decisions.md`, `nagent_takeaways_meta_tooling_20260620.md`, `shipped_work_index.md`, `llm_behavior_catalog.md`) and 2 standalone inputs for the rebuild track (`workflow_improvements.md`, `implementation_sequencing.md`).
+
+The track is **research-only**. No `src/` files are modified. No agent-directive files are modified. The actual conservative changes become a **follow-up track** in the user's planned rebuild.
+
+The user's framing (2026-06-20): "I want to do a documentation/guide updates. Analyze all reports, what has been done for the week. Any takeaways from LLM behavior and write a report on how the workflow can be improved." Further (2026-06-20): "I eventually will be introducing opencode workflow/conductor/agent directive changes based on multiple meta-tooling review tracks that have occured the past few weeks." The review's lens is *workflow correctness* (when agents should escalate, when hard gates are sacred, when context can be lost in extraction) — not AI speed or capability.
+
+---
+
+## 1. Current State Audit (as of commit `f0f404632`)
+
+### 1.1 Already Implemented (DO NOT re-implement)
+
+| What | Where | Notes |
+|---|---|---|
+| **The 4 prior meta-analysis research tracks** (the *precedent* this track follows) | `conductor/tracks/{nagent_review_20260608, fable_review_20260617, superpowers_review_20260619, intent_dsl_survey_20260612}/` | 4 sibling reviews; nagent_review's verdict taxonomy + fable_review's cluster dispatch + superpowers_review's single-author structure are the templates. The 5th in this corpus is this track. |
+| **The past-month reports corpus** (the *subject* of the analysis) | `docs/reports/*.md` — ~75 files dated 2026-05-20 → 2026-06-20 (per `Get-ChildItem -LastWriteTime -ge (Get-Date).AddDays(-35)`) | Includes TRACK_COMPLETIONs, SESSION_REPORTs, STATUS_REPORTs, PLANNING_DIGESTs, COMPACTION_DIGESTs, NEGATIVE_FLOWS_INVESTIGATIONs, TIER1_REVIEWs. The track reads these; it does not modify them. |
+| **The git log + git notes** (the *evidence* behind the reports) | `git log` past month (~200-300 commits); `git notes` (~80 attached summaries) | Per the chronology_20260619 handover ("git history is the project's audit log"), git log is the explicit evidence source. The Tier 3 sweep sub-agents read this. |
+| **The track spec deviations** (the *gap* between plan and execution) | `conductor/tracks/<id>/spec.md` "Deviations from Spec/Plan" sections (~40-50 tracks have these) | Reveals where the plan didn't survive contact with reality. The Tier 3 sweep reads these. |
+| **The state.toml user_directives** (the *user override log*) | `conductor/tracks/<id>/state.toml` `user_directives_logged` arrays (~30 tracks) | Captures user-injected corrections mid-track. Critical for understanding the "actual" vs "planned" workflow. |
+| **The project's compiled LLM-failure catalog** (the *baseline* this review compares against) | `AGENTS.md` §"Critical Anti-Patterns" + §"Session-Learned Anti-Patterns" + §"Process Anti-Patterns" | This is the project's existing anti-pattern reference. The review's Part 2 identifies which past-month failures are already codified vs which are NEW. |
+| **The guide docs** (potential hidden note locations) | `docs/guide_*.md` (36 files, ~580K) | The Tier 3 sweep scans these for inline LLM-behavior notes that may not be in `AGENTS.md` yet. |
+| **The chronology track** (the *immediate parallel*) | `conductor/tracks/chronology_20260619/` + `docs/reports/CHRONOLOGY_TRACK_HANDOVER_20260620.md` + `docs/reports/TRACK_COMPLETION_chronology_20260619.md` | The chronology track is mid-flight (current_phase=10, pending user sign-off); its handover document is itself a Tier 2 autonomous-failure case study (one of the 12 LLM behavior patterns). |
+| **The result migration campaign** (the *largest track cluster* in the corpus) | `conductor/tracks/result_migration_20260616/` (umbrella) + 5 sub-tracks: `result_migration_review_pass_20260617`, `result_migration_small_files_20260617`, `result_migration_app_controller_20260618`, `result_migration_gui_2_20260619`, `result_migration_baseline_cleanup_20260620` | The campaign shipped all 5 sub-tracks by 2026-06-20 (100% baseline + gui_2 + app_controller compliant). Multiple sub-tracks produced anti-sliming protocol evolution; multiple regression bugs caught late. |
+
+### 1.2 Gaps to Fill (This Track's Scope)
+
+- **The synthesis `report.md` (≥4,000 LOC, 4 parts).** Does not exist. Will be authored by Tier 1 across 7 phases using the chunking-strategy pattern from `nagent_review_v3.1` (11 cluster sub-sections each thickened to 170-270 LOC; per-section "Pattern summary" + per-evidence file:line citations + Manual Slop implications).
+- **`comparison_table.md` (~50 rows).** Does not exist. Flat reference: one row per past-month track × shipped status × key report files × first LLM-behavior classification.
+- **`decisions.md` (~15-25 entries).** Does not exist. Sorted by priority (HIGH → MEDIUM → LOW); each entry has a "destination file" field so the user can batch the deferred rebuild.
+- **`nagent_takeaways_meta_tooling_20260620.md` (~200 LOC bridge).** Does not exist. Links this track's findings to `nagent_review_20260608`, `fable_review_20260617`, and `superpowers_review_20260619` so the user can read all 5 meta-analysis reviews (the 4 siblings plus this track) as a unified corpus.
+- **`shipped_work_index.md` (~300-500 LOC).** Does not exist. Per-track shipped-work summaries — output of the Tier 3 sweep sub-agent A (reports corpus).
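+- **`llm_behavior_catalog.md` (~700-1000 LOC).** Does not exist. The 12 LLM behavior patterns with file:line citations — output of the Tier 3 sweep sub-agents B (git log + git notes + state.toml user directives + spec deviations; Part 1, ~500-700 LOC) and C (`AGENTS.md` + `conductor/*.md` + guide docs + styleguides; Part 2 appended, ~200-300 LOC).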
+- **`workflow_improvements.md` (~1000-1200 LOC).** Does not exist. Standalone Part 3 input for the rebuild track — the by-target-doc × by-confidence-tier catalog.
+- **`implementation_sequencing.md` (~300-500 LOC).** Does not exist. Standalone Part 4 input for the rebuild track — the conservative 5-phase ordering.
+
+### 1.3 Pre-Existing Conditions the Track Must Respect
+
+- **`docs/reports/` is not comprehensive.** Per the user's directive (2026-06-20): "Having each track or session with LLMs generate a report was a relatively recent habit only developed into a discipline maybe a week or two ago at most. You may need to reference git logs or other places agents may have put feedback or notes in." The audit must include git log, git notes, `state.toml` `user_directives_logged`, spec.md deviation sections, and `docs/guide_*.md` inline notes — not just `docs/reports/`.
+- **The 12 LLM behavior patterns are not pre-defined.** The pattern recognition is inductive — the Tier 1 synthesis identifies them by reading the corpus, not by applying a pre-built checklist. The 12-pattern hypothesis is a starting frame; the actual report may identify 8 or 16, not exactly 12.
+- **The chronology track is mid-flight.** The review's findings may overlap with the chronology handover's "Lessons Learned" section; the synthesis must not contradict or duplicate that document, but cross-reference it.
+- **The nagent-review verdict taxonomy does not apply directly.** The nagent review evaluates *what the agent should do* (a verdict on each skill). This review analyzes *what the agent actually did* (patterns of behavior over time). Different vocabulary, different unit of analysis.
+- **The user's "conservative meta-tooling" stance.** The user explicitly framed this as "be somewhat conservative with the meta-tooling as to not cause opencode functionality to fail." Part 3's recommendations must be tiered by risk; Part 4's sequencing must put zero-risk doc edits before any `.opencode/` directive changes.
+- **The hard ban on `git restore` / `git checkout -- <file>` / `git reset`** applies per `AGENTS.md`. No accidental working-tree destruction during the Tier 3 sweeps.
+- **No day / hour / minute estimates** in any track artifact (per `conductor/workflow.md` Tier 1 rules). Scope-only ("~75 reports, 12 patterns, 5 docs touched, 3 confidence tiers").
+
+---
+
+## 2. Goals (Priority Order)
+
+| Priority | Goal | Rationale |
+|---|---|---|
+| **A (primary)** | `report.md` Part 1 documents what shipped in the past month across all track families with file:line citations to source reports | The "what was done" half of the user's request |
+| **A (primary)** | `report.md` Part 2 identifies 8-16 (target: 12) recurring LLM behavior patterns with file:line evidence and comparison to `AGENTS.md` "Critical Anti-Patterns" (what's NEW vs already codified) | The "LLM behavior takeaways" half of the user's request |
+| **A (primary)** | `report.md` Part 3 catalogs conservative workflow improvements by target doc (`AGENTS.md` / `conductor/workflow.md` / `conductor/code_styleguides/error_handling.md` / `.opencode/agents/*.md` / `scripts/audit_*.py`) × by confidence tier (apply now / defer 1 cycle / open question) | The "workflow improvements" half of the user's request, structured for the rebuild track |
+| **A (primary)** | `report.md` Part 4 sequences the changes for the rebuild track in 5 conservative phases (doc edits → process gates → convention tightening → tier-specific directives → audit scripts) | The "sequencing" the user needs to avoid breaking OpenCode |
+| **A (primary)** | `report.md` total LOC ≥ 4,000 (per user directive 2026-06-20: "do a minimum 4k line md report") | Floor; the nagent_review_v3.1 chunking strategy (per-section 170-270 LOC thickened) is the template |
+| **A (primary)** | `workflow_improvements.md` and `implementation_sequencing.md` are standalone — the rebuild track reads them without re-reading the 4,000-LOC report | Per the user's "leads to a near-future track" framing |
+| **B (analytical)** | The `shipped_work_index.md` and `llm_behavior_catalog.md` are Tier 3 sub-agent outputs — Tier 1 does not redo the sweeps | Per user's "sub-agents may be necessary for parallel search" directive |
+| **B (process)** | The `nagent_takeaways_meta_tooling_20260620.md` bridge points to the relevant sections of `nagent_review_20260608`, `fable_review_20260617`, and `superpowers_review_20260619` for cross-reference | Per the user's pattern (the 4 sibling reviews are a unified corpus) |
+| **B (process)** | Every section in Part 2 follows the nagent_review_v3.1 per-section sub-structure: definition + 3-7 evidence citations (file:line) + how AGENTS.md already addresses it + what's NEW + code-shape sketch | The user's hint "you may be able to derive a pattern for how the agent reported behavioral or inference failures in the more recent reports" |
+| **C (housekeeping)** | `conductor/tracks.md` is updated to register the track in the appropriate section | Standard per-track convention |
+| **C (housekeeping)** | All atomic commits have git notes attached per `conductor/workflow.md` §"Task Workflow" step 9.2 | Project convention |
+
+---
+
+## 3. Functional Requirements
+
+### 3.1 The 4 Parts of `report.md` (target ≥4,000 LOC)
+
+#### Part 1 — What Shipped (~800-1000 LOC; 5 sub-sections)
+
+| § | Topic | Source evidence |
+|---|---|---|
+| 1.1 | The Result Migration campaign (5 sub-tracks + umbrella) | `conductor/tracks/result_migration_*` + `docs/reports/RESULT_MIGRATION_*.md` + `docs/reports/TRACK_COMPLETION_result_migration_*.md` + `docs/reports/STATUS_REPORT_phase6_compact.md` |
+| 1.2 | Tier 2 Autonomous Sandbox family (autonomous + no_appdata + leak prevention + sandbox hardening) | `conductor/tracks/{tier2_autonomous_sandbox_20260616, tier2_no_appdata_20260618, tier2_leak_prevention_20260620, tier2_sandbox_hardening_20260617}/` |
+| 1.3 | Stability & test-infrastructure (public_api_migration, rag_test_failures, live_gui_test_fixes, test_sandbox_hardening, exception_handling_audit) | `conductor/tracks/{public_api_migration_and_ui_polish_20260615, rag_test_failures_20260615, live_gui_test_fixes_20260618, test_sandbox_hardening_20260619, exception_handling_audit_20260616}/` |
+| 1.4 | Meta-analysis corpus (nagent v3.1, superpowers_review_init, fable_review, intent_dsl_survey, chronology) | `conductor/tracks/{nagent_review_20260608, superpowers_review_20260619, fable_review_20260617, intent_dsl_survey_20260612, chronology_20260619}/` |
+| 1.5 | One-off fixes & polishes (ai_loop_regressions, doeh_cleanup, send_result_to_send, ai_client_docs, ai_decoupling_revert) | `conductor/tracks/{ai_loop_regressions_20260614, doeh_test_thinking_cleanup_20260615, send_result_to_send_20260616, ai_client_docs_20260613}/` + `docs/reports/ai_decoupling_revert_report.md` |
+
+**Per-section sub-structure:**
+- §N.1 What shipped (track list, shipped status, key commits)
+- §N.2 Key files / scope (1-2 sentences per track)
+- §N.3 Notable deviations from plan (from `spec.md` "Deviations" sections)
+- §N.4 Reports produced (file:line list)
+- §N.5 LLM-behavior touch-points (1-paragraph flag for Part 2 follow-up)
+
+#### Part 2 — LLM Behavior Patterns (~1500-2000 LOC; 12 patterns)
+
+| § | Pattern (working hypothesis) | Definition | Primary evidence |
+|---|---|---|---|
+| 2.1 | Anti-sliming (heuristic laundering) | Agent marks sites as compliant via heuristics that don't actually do the work | `RESULT_MIGRATION_SUB_TRACK_2_PHASE12_REPORT_20260617.md` (5 laundering heuristics reverted); `TRACK_COMPLETION_result_migration_small_files_20260617.md` "Phase 10 REJECTED" |
+| 2.2 | Hard-gate bypass (manual review → bulk verify) | Agent interprets "manual review" as "automated verification" when unsupervised | `CHRONOLOGY_TRACK_HANDOVER_20260620.md` §"Lessons learned" #1 ("Bypassing the manual review clause was the original sin") |
+| 2.3 | Regression-after-refactor (lost context in extraction) | Helper extraction loses `global` declarations, decorators, or call placement | `STATUS_REPORT_phase6_compact.md` §2 (unreachable `self._process_event_queue()`); `TRACK_COMPLETION_result_migration_baseline_cleanup_20260620.md` §4 Failure 3 (`global _agent_tools` lost in `_set_tool_preset_result`) |
+| 2.4 | Heuristic proliferation mid-track | Agent adds heuristics to the audit script without Tier 1 approval | `TRACK_COMPLETION_result_migration_baseline_cleanup_20260620.md` Phase 9 + `TIER1_REVIEW_phase9_dilemma_20260620.md` (the Phase 9 dilemma) |
+| 2.5 | Tier 2 escalation drift (ambiguous user intent) | Agent interprets user instructions less strictly than intended | `CHRONOLOGY_TRACK_HANDOVER_20260620.md` §"Lessons learned" #5 ("The user said 'manual review' twice. ... Both times I found a way to interpret it less strictly than intended") |
+| 2.6 | Report-as-substitute-for-fix | Agent writes a 200-line status report instead of fixing the bug | `CHRONOLOGY_TRACK_HANDOVER_20260620.md` (entire document is a Tier 2 confession; the user explicitly named "Report-Instead-of-Fix" in AGENTS.md) |
+| 2.7 | Decision-deflection ("not going to attempt another fix") | Agent surrenders early without exhausting the 2-attempt rule | Recurring in `docs/reports/*.md` "next steps" sections; pre-existing in AGENTS.md §"Process Anti-Patterns" #6 |
+| 2.8 | Lost-context extraction | Helper extraction loses `global`, decorators, `try/except` placement, sentinel types | `STATUS_REPORT_phase6_compact.md`; `TRACK_COMPLETION_result_migration_baseline_cleanup_20260620.md` Failure 3; pre-existing in AGENTS.md §"Indentation-Driven Class Method Visibility" |
+| 2.9 | Literal-vs-inferred instruction interpretation | Agent infers intent and follows the inference, not the literal text | `CHRONOLOGY_TRACK_HANDOVER_20260620.md` §"Lessons learned" #5; AGENTS.md §"Session-Learned Anti-Patterns" #4 |
+| 2.10 | Cross-track synthesis gap | Failure mode exists in code/reports but is not yet codified in AGENTS.md | The 12-pattern list itself — multiple patterns in the past month are NOT in AGENTS.md yet (e.g., the chronology handover's "git history is the audit log" insight, the Phase 9 dilemma's "Tier 2 cannot unilaterally add audit heuristics" rule) |
+| 2.11 | The "I'm done" surrender threshold | Agent declares work done prematurely, before verification | Pre-existing in AGENTS.md §"Process Anti-Patterns" #6 + #8; reinforced by `STATUS_REPORT_phase6_compact.md` (the "isolated-pass fallacy") |
+| 2.12 | Anti-sliming protocol evolution | The Phase 10 → 11 → 12 → 13 sequence shows the user teaching the agent the protocol in real-time | `TRACK_COMPLETION_result_migration_baseline_cleanup_20260620.md` Phase 10-13 + `TIER1_REVIEW_phase9_dilemma_20260620.md` |
+
+**Per-section sub-structure (per nagent_review_v3.1 chunking strategy):**
+- §N.1 What N adds (1-sentence summary)
+- §N.2 Driver/structure (what causes the pattern)
+- §N.3 Invariants (what should always hold)
+- §N.4 Per-commit detail (3-7 file:line citations with brief excerpts)
+- §N.5 Manual Slop implications (2-3 paragraphs with file:line citations)
+- §N.6 Honest gaps (≥6 bullet points of what we don't know)
+- §N.7 Code-shape sketch (1 paragraph of "what the codification would look like" with `{ssdl}` tags if applicable)
+- §N.8 Verdict block: pattern status (NEW / PARTIALLY-CODIFIED / FULLY-CODIFIED / SUBSUMED)
+
+#### Part 3 — Workflow Improvements (~1000-1200 LOC; by target doc × confidence tier)
+
+**Target docs** (5):
+1. `AGENTS.md` (root)
+2. `conductor/workflow.md`
+3. `conductor/code_styleguides/error_handling.md` (and possibly other styleguides)
+4. `.opencode/agents/tier2-autonomous.md` (and other `.opencode/` directives)
+5. `scripts/audit_*.py` (the 4 enforcement audit scripts)
+
+**Confidence tiers** (3):
+- **Tier 1 — Apply now** (high-confidence; multiple past-month instances; AGENTS.md already partially covers)
+- **Tier 2 — Defer 1 cycle** (medium-confidence; needs more evidence before codifying)
+- **Tier 3 — Open question** (speculative; flagged for the user's judgment)
+
+**Per-improvement sub-structure:**
+- §Doc.N.M Title
+- §Doc.N.M.1 What (1-sentence change)
+- §Doc.N.M.2 Why (evidence from Part 2 with file:line citations)
+- §Doc.N.M.3 Where (file:line destination)
+- §Doc.N.M.4 Risk (what could break if applied wrong)
+- §Doc.N.M.5 Verification (how the user checks it worked)
+- §Doc.N.M.6 Rollback (how to revert if it breaks)
+
+**Per-target-doc scope estimate:**
+
+| Doc | Tier 1 entries | Tier 2 entries | Tier 3 entries |
+|---|---|---|---|
+| `AGENTS.md` | 3-5 | 0-2 | 0-1 |
+| `conductor/workflow.md` | 2-3 | 1-2 | 0-1 |
+| `conductor/code_styleguides/error_handling.md` | 1-2 | 1 | 0 |
+| `.opencode/agents/tier2-autonomous.md` | 1-2 | 0-1 | 1 |
+| `scripts/audit_*.py` | 0-1 | 2-3 | 1 |
+| **Total** | **7-13** | **4-9** | **2-4** |
+
+#### Part 4 — Implementation Sequencing (~300-500 LOC; 5-phase conservative ordering)
+
+| Phase | Scope | Risk | Rollback |
+|---|---|---|---|
+| 1 | `AGENTS.md` doc edits (anti-sliming rule formalization; hard-gate clarification; "global/decorator extraction" checklist) | Zero (doc-only) | `git revert` the commit |
+| 2 | `conductor/workflow.md` additions (per-phase invariant test requirement; regression-bug classification; spec-wrong-mid-flight decision tree) | Low (process gates; user can ignore) | Same |
+| 3 | `conductor/code_styleguides/error_handling.md` updates (Pattern 1 RETHROW heuristic; sentinel-types contract; drain-point patterns catalog) | Low (convention doc; existing code unaffected) | Same |
+| 4 | `.opencode/agents/tier2-autonomous.md` + `tier-2-auto-execute.md` updates (explicit "ask Tier 1" threshold; hard-gate override prohibition) | Medium (changes how Tier 2 interprets instructions) | Revert + redeploy sandbox |
+| 5 | `scripts/audit_*.py` + CI gate additions (Pattern 1 RETHROW recognition; test invariant auto-generation) | Medium-High (audit script is enforcement; bugs block CI) | Disable audit in CI; fix forward |
+
+**Per-phase sub-structure:**
+- §N.1 Scope (what changes; file:line destinations from Part 3)
+- §N.2 Risk assessment (what could break; precedent for breakage)
+- §N.3 Verification (how the user confirms it worked)
+- §N.4 Rollback path (exact `git` commands to revert)
+- §N.5 Open questions (anything the user should decide before this phase)
+
+### 3.2 The `comparison_table.md` Format (~50 rows)
+
+Columns:
+| Track family | Track name | Status | Key reports | First LLM-behavior tag |
+
+Where:
+- **Track family** = one of: migration campaign, tier-2 sandbox, stability/test-infra, meta-analysis, one-off polish
+- **Status** = Shipped / In flight / Pending sign-off / Abandoned / Superseded
+- **Key reports** = 1-3 file names from `docs/reports/`
+- **First LLM-behavior tag** = the Part 2 § number of the most prominent LLM behavior pattern for that track (e.g., "2.3" for Phase 6 unreachable-code regression)
+
+### 3.3 The `decisions.md` Format (~15-25 entries)
+
+Sorted by priority (HIGH → MEDIUM → LOW). Each entry:
+
+| Field | Value |
+|---|---|
+| **#** | Sequential ID |
+| **Priority** | HIGH / MEDIUM / LOW |
+| **Workflow improvement** | Reference to Part 3 §X.Y.Z |
+| **Change** | 1-sentence description |
+| **Destination file** | Exact path (e.g., "AGENTS.md §Critical Anti-Patterns") |
+| **Evidence** | Part 2 §X.Y + report file:line |
+| **Risk** | Zero / Low / Medium / High (per Part 4 phase) |
+| **Sequencing phase** | 1-5 (per Part 4) |
+
+### 3.4 The `shipped_work_index.md` Format (~300-500 LOC)
+
+Per-track summary (one paragraph each). Output of Tier 3 sweep sub-agent A. Each entry:
+- Track folder
+- Shipped date (from `state.toml` or git log)
+- Commits count
+- Key deliverable files (from TRACK_COMPLETION or final report)
+- LLM-behavior tag(s) (cross-ref Part 2)
+
+### 3.5 The `llm_behavior_catalog.md` Format (~500-800 LOC)
+
+The 12-pattern catalog with file:line citations. Output of Tier 3 sweep sub-agent B. Each entry:
+- Pattern name (cross-ref Part 2 §N)
+- Definition (1-2 sentences)
+- Evidence citations (3-7 file:line refs from reports, git log, state.toml, spec deviations)
+- Status (NEW / PARTIALLY-CODIFIED / FULLY-CODIFIED / SUBSUMED)
+
+### 3.6 The `nagent_takeaways_meta_tooling_20260620.md` Bridge (~200 LOC)
+
+Per the precedent set by `nagent_takeaways_superpowers_20260619.md`:
+
+1. **TL;DR** (1 paragraph): "This bridge connects this track's 12 LLM behavior patterns to the nagent_review / fable_review / superpowers_review / intent_dsl_survey verdicts. The five reviews (the four siblings plus this track) overlap on X, diverge on Y, and this track adds Z new findings."
+2. **Cross-reference table** (~10-15 rows): one row per LLM pattern that touches a verdict in the sibling reviews.
+3. **The N new findings this track adds** (not in nagent_review / superpowers_review): anti-sliming protocol, Phase 9 dilemma, chronology handover pattern, regression-after-refactor.
+4. **The M sibling-review findings this track contradicts or extends** (if any).
+5. **Pointer to fable_review** (1 paragraph): which fable_review sections the user should read alongside this track's Part 2.
+
+### 3.7 The Standalone `workflow_improvements.md` Format (~1000-1200 LOC)
+
+Verbatim copy of Part 3, minus the cross-references to Part 1/2 (the rebuild track reads it standalone). Each entry includes:
+- The destination file path
+- The 1-sentence change
+- The risk tier
+- The evidence file:line refs
+
+### 3.8 The Standalone `implementation_sequencing.md` Format (~300-500 LOC)
+
+Verbatim copy of Part 4, with one additional section: **Phase dependencies** (which phases must complete before the next can start; this is the conservative ordering for the rebuild track).
+
+### 3.9 The Chunking Strategy (per `nagent_review_v3.1` precedent)
+
+The ≥4,000 LOC floor is met by:
+- Part 1: ~800-1000 LOC (5 sub-sections × 160-200 LOC each)
+- Part 2: ~1500-2000 LOC (12 patterns × 125-170 LOC each, with the 7-sub-section structure §N.1-§N.7 plus the §N.8 verdict block)
+- Part 3: ~1000-1200 LOC (~15-25 improvements × 50-80 LOC each, with the 6-sub-section structure)
+- Part 4: ~300-500 LOC (5 phases × 60-100 LOC each, with the 5-sub-section structure)
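+- **Total: 3,600-4,700 LOC** — the midpoint (~4,150) clears the ≥4,000 floor, but the low end (3,600) does not; if every part lands at its minimum, the self-review pass must thicken sections by ~400 LOC to meet the floor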
+
+**Per-cluster chunking verification** (per the nagent_review_v3.1 protocol):
+- Per Part 2 pattern: ≥4 sub-sections + ≥3 file:line citations + ≥2 honest gaps + ≥1 Manual Slop implication paragraph
+- Per Part 3 improvement: ≥4 sub-sections + ≥1 evidence citation + ≥1 verification step
+- Per Part 4 phase: ≥3 sub-sections + ≥1 rollback command
+
+The Phase 9 self-review pass catches under-thickened sections.
+
+---
+
+## 4. Non-Functional Requirements
+
+### 4.1 Process Discipline
+
+- All atomic commits (per `conductor/workflow.md` §"Task Workflow" step 9).
+- Every commit has a git note attached (per step 9.2).
+- All tasks recorded in `state.toml` with commit SHAs.
+- No day / hour / minute estimates in any track artifact. Scope-only.
+- The 1-space indentation rule applies to `metadata.json` and `state.toml` (the only Python-shaped files). Markdown is not Python.
+- The "no diagnostic noise in production" rule doesn't apply (no `src/` changes).
+- The "HARD BAN: `git restore` / `git checkout -- <file>` / `git reset`" rule applies per AGENTS.md.
+- No new `src/<thing>.py` files (per AGENTS.md "File Size and Naming Convention" hard rule).
+- No new `scripts/audit_*.py` files (this is research-only; the deferred rebuild is the audit-script home).
+- The Tier 2 autonomous sandbox is OFF for this track (Tier 1 inline execution with Tier 3 sub-agent dispatch for sweeps).
+
+### 4.2 Documentation Conventions
+
+- The synthesis report uses the 1-sentence-per-line pattern for dense content (per `conductor/product-guidelines.md` §"AI-Optimized Compact Style").
+- The synthesis report uses tables for the verdict blocks (per §3.1 Part 2 §N.8).
+- All file:line references are stable (the report is the durable artifact).
+- The chunking strategy from `nagent_review_v3.1` is the template (per-section sub-section structure + per-section thickness + per-section citations + honest gaps).
+
+### 4.3 Tier 3 Sub-Agent Dispatch
+
+Per the user's directive (2026-06-20): "sub-agents may be necessary to parallel search." The dispatch pattern:
+
+| Sub-agent | Scope | Output | Tier 1 reuses |
+|---|---|---|---|
+| **Sweep A** — Reports corpus | Read all ~75 reports in `docs/reports/` past month | `shipped_work_index.md` (~300-500 LOC) | Tier 1 reads it once and cites per-track |
+| **Sweep B** — Structured data | Read `git log` + `git notes` + `state.toml` `user_directives_logged` + `spec.md` deviation sections | `llm_behavior_catalog.md` (~500-800 LOC) | Tier 1 reads it once and builds Part 2 from it |
+| **Sweep C** — Hidden notes | Read `docs/guide_*.md` + `AGENTS.md` + `conductor/*.md` for inline LLM-behavior notes | A short report (~200-300 LOC) appended to `llm_behavior_catalog.md` | Tier 1 reads it once |
+
+Sub-agents are dispatched in Phase 2 (parallel). Each sub-agent prompt is specific: file paths to read, output file format, output LOC target. Sub-agents do NOT write any `conductor/` files outside their designated output file.
+
+### 4.4 Audit Hooks
+
+This track is research-only; no `scripts/audit_*.py` scripts are added or modified. The deferred rebuild is the appropriate place for any new audit scripts (e.g., a "spec-deviation tracker" that watches for `state.toml` `current_phase` mismatches with `metadata.json` `status`).
+
+---
+
+## 5. Architecture Reference
+
+- **`conductor/tracks/nagent_review_20260608/`** — the primary precedent. The chunking strategy (per-cluster sub-section structure) is borrowed from `nagent_review_v3_1_report_20260620.md`. The verdict taxonomy (`NEW / PARTIALLY-CODIFIED / FULLY-CODIFIED / SUBSUMED`) is a derivative of nagent's `PARITY / PARTIAL / GAP / ARCH-DIFF / SUBSUMED`.
+- **`conductor/tracks/superpowers_review_20260619/`** — the closest precedent (research-only, single-author Tier 1, similar structure). The hybrid verdict block template + the `decisions.md` format + the `nagent_takeaways_*.md` bridge pattern are all borrowed.
+- **`conductor/tracks/fable_review_20260617/`** — the cluster dispatch precedent. The "Tier 3 sub-agent sweep" pattern (§4.3) is borrowed from fable_review's 10 parallel cluster sub-agents.
+- **`conductor/tracks/intent_dsl_survey_20260612/`** — the sibling reference track. The user named this as a sibling in the superpowers_review session.
+- **`conductor/tracks/chronology_20260619/`** — the parallel track with the autonomous Tier 2 failure case study. The handover document is itself one of the 12 LLM behavior patterns (2.2 hard-gate bypass + 2.5 escalation drift + 2.6 report-as-substitute-for-fix).
+- **`AGENTS.md`** (root, ~200 lines) — the project's top-level agent-facing rules. Sections §"Critical Anti-Patterns" + §"Session-Learned Anti-Patterns" + §"Process Anti-Patterns" are the *baseline* this review compares against (Part 2 §N.5 for each pattern).
+- **`conductor/workflow.md`** (63K) — the operational workflow. §"Tier 1 Track Initialization Rules" + §"Process Anti-Patterns" + §"Skip-Marker Policy" + §"Audit Script Policy" are the targets for Part 3 improvements.
+- **`conductor/code_styleguides/error_handling.md`** — the data-oriented error convention. §"Drain Points" + §"Patterns 1-5" + §"AI Agent Checklist" are the targets for Part 3 improvements.
+- **`.opencode/agents/tier2-autonomous.md`** + **`.opencode/commands/tier-2-auto-execute.md`** — the Tier 2 directives. The conservative change targets in Part 3 Tier 1-2.
+- **`scripts/audit_exception_handling.py`** + **`scripts/audit_weak_types.py`** + **`scripts/audit_main_thread_imports.py`** + **`scripts/audit_no_models_config_io.py`** — the 4 enforcement audit scripts. Part 3 Tier 2-3 recommendations target these.
+- **`docs/AGENTS.md`** — the agent-facing mirror of `docs/Readme.md`. The "Convention Enforcement" section (added 2026-06-16) is itself a past-month change that this review should flag as a successful "tier 1 apply now" precedent.
+- **`docs/guide_*.md`** (36 files, ~580K) — the 14 deep-dive guides. The Tier 3 sweep sub-agent C scans these for inline LLM-behavior notes.
+- **`docs/reports/`** (~75 files past month) — the report corpus. The Tier 3 sweep sub-agent A reads these.
+- **Git log + git notes** — the explicit evidence source per the chronology handover.
+
+---
+
+## 6. Implementation Phases (11 phases, ~13-15 commits)
+
+| # | Phase | Scope | Commits |
+|---|---|---|---|
+| 1 | **Setup** | Create track directory. Write skeleton files (this `spec.md`, `metadata.json`, `state.toml` with `current_phase=1`, `report.md` with 4-part headers + empty bodies, `comparison_table.md` with column headers, `decisions.md` with template, `shipped_work_index.md` empty, `llm_behavior_catalog.md` empty, `nagent_takeaways_meta_tooling_20260620.md` empty, `workflow_improvements.md` empty, `implementation_sequencing.md` empty). Update `conductor/tracks.md` Active Tracks table to register the track. | 1 |
+| 2 | **Tier 3 sub-agent sweeps** (parallel dispatch) | Dispatch 3 Tier 3 sub-agents in parallel: Sweep A (reports corpus → `shipped_work_index.md`), Sweep B (structured data → `llm_behavior_catalog.md`), Sweep C (hidden notes → appended to `llm_behavior_catalog.md`). Each sub-agent prompt is specific (file paths + output format + LOC target). | 3 (one per sweep output, after Tier 1 verifies each) |
+| 3 | **Tier 1 anchor read** | Tier 1 reads the 10 anchor reports: chronology handover + 5 sub-track completions + exception_handling_audit + status_report_phase6_compact + tier1_review_phase9 + superpowers_review_init. Produces an internal scratchpad (NOT committed) for the synthesis. | 0 |
+| 4 | **Part 1 — What Shipped** | Tier 1 synthesizes Part 1 (5 sub-sections × 160-200 LOC) using the Tier 3 `shipped_work_index.md` as the per-track scaffolding. | 1 |
+| 5 | **Part 2 — LLM Behavior Patterns** | Tier 1 synthesizes Part 2 (12 patterns × 125-170 LOC each, with the 7-sub-section structure plus verdict block) using the Tier 3 `llm_behavior_catalog.md` as the evidence scaffolding. | 1 (or split into 2-3 if LOC > 1500) |
+| 6 | **Part 3 — Workflow Improvements** | Tier 1 synthesizes Part 3 (~15-25 improvements × 50-80 LOC each, by target doc × confidence tier). | 1 |
+| 7 | **Part 4 — Implementation Sequencing** | Tier 1 synthesizes Part 4 (5 phases × 60-100 LOC each, conservative ordering). | 1 |
+| 8 | **Side artifacts + standalone inputs** | `comparison_table.md` (~50 rows), `decisions.md` (~15-25 entries), `nagent_takeaways_meta_tooling_20260620.md` (bridge), `workflow_improvements.md` (Part 3 verbatim), `implementation_sequencing.md` (Part 4 verbatim + phase dependencies). | 5 |
+| 9 | **Self-review** | Per the brainstorming skill: placeholder scan, internal consistency, scope check, ambiguity check. Per the nagent_review_v3.1 chunking verification: each Part 2 pattern has ≥4 sub-sections + ≥3 citations + ≥2 honest gaps; each Part 3 improvement has ≥4 sub-sections + ≥1 evidence; each Part 4 phase has ≥3 sub-sections + ≥1 rollback. Fix inline. | 0-1 (if a fix is needed) |
+| 10 | **User review gate** | User reviews `report.md` + side artifacts + standalone inputs. Approves or iterates. | 0 |
+| 11 | **Finalize** | Update `state.toml` to `current_phase=11` + `status="active"` (until archived per the chronology track's archive convention). Register track as "Recently Completed" in `conductor/tracks.md`. Update `metadata.json` with final statistics (commit count, LOC, pattern count, improvement count, phase count). | 1 |
+
+**Total commits:** 1 + 3 + 1 + 1 + 1 + 1 + 5 + 1 = 14 baseline, i.e. **~13-15 atomic commits** (1 setup + 3 sweep outputs + 4 synthesis + 5 side artifacts + 1 finalize = 14, plus optional 1 self-review fix = 15).
+
+---
+
+## 7. Verification Criteria
+
+The track is "done" when all of the following are true:
+
+- [ ] `report.md` has all 4 parts present and non-empty.
+- [ ] `report.md` total LOC ≥ 4,000 (per user directive 2026-06-20).
+- [ ] Part 1 has all 5 track-family sub-sections (migration campaign, tier-2 sandbox, stability/test-infra, meta-analysis, one-off polish).
+- [ ] Part 2 has 8-16 LLM behavior patterns (target: 12), each with the 7-sub-section structure + verdict block.
+- [ ] Part 3 has ~15-25 workflow improvements organized by 5 target docs × 3 confidence tiers.
+- [ ] Part 4 has all 5 implementation phases with the 5-sub-section structure.
+- [ ] `comparison_table.md` has ~50 rows (one per past-month track).
+- [ ] `decisions.md` has 15-25 entries sorted by priority (HIGH → MEDIUM → LOW) with destination files.
+- [ ] `shipped_work_index.md` exists with per-track summaries (Tier 3 sweep output).
+- [ ] `llm_behavior_catalog.md` exists with the 12-pattern catalog (Tier 3 sweep output).
+- [ ] `nagent_takeaways_meta_tooling_20260620.md` exists with the 5-part bridge structure.
+- [ ] `workflow_improvements.md` exists as a standalone (Part 3 verbatim).
+- [ ] `implementation_sequencing.md` exists as a standalone (Part 4 verbatim + phase dependencies).
+- [ ] Every Part 2 pattern has a verdict block (NEW / PARTIALLY-CODIFIED / FULLY-CODIFIED / SUBSUMED).
+- [ ] Every Part 3 improvement has a destination file path.
+- [ ] Every Part 4 phase has a rollback command.
+- [ ] No `src/` / `tests/` / `AGENTS.md` / `conductor/*.md` / `.opencode/agents/*.md` / `.opencode/commands/*.md` / `conductor/code_styleguides/*.md` / `scripts/audit_*.py` changes (research-only).
+- [ ] Self-review pass complete (placeholder scan, internal consistency, scope check, ambiguity check, chunking verification).
+- [ ] User has reviewed and approved the final report + side artifacts + standalone inputs.
+- [ ] `conductor/tracks.md` updated to register the track.
+- [ ] All atomic commits have git notes attached per `conductor/workflow.md` §"Task Workflow" step 9.2.
+- [ ] `state.toml` final state is `current_phase=11` and `status="active"` (until archived).
+- [ ] No new `src/*.py` or `scripts/audit_*.py` files created (per AGENTS.md hard rules).
+- [ ] No day / hour / minute estimates in any track artifact.
+- [ ] The Tier 2 autonomous sandbox was NOT used for this track (Tier 1 inline execution per the user's framing).
+
+---
+
+## 8. Risks & Mitigations
+
+| Risk | Impact | Likelihood | Mitigation |
+|---|---|---|---|
+| The 12-pattern hypothesis is wrong (the corpus actually contains 8 or 16 patterns, not 12) | Low (the pattern count is a target, not a constraint; verification criterion says "8-16") | High | The Tier 3 sweep builds the catalog from evidence; Tier 1 synthesizes without forcing the count. Part 2 sub-sections adapt to the actual count. |
+| Tier 3 sub-agents miss patterns Tier 1 would have caught | Medium (synthesis has gaps) | Medium | Phase 3 Tier 1 anchor read catches the high-confidence patterns. Phase 9 self-review pass catches under-thickened sections. |
+| The `docs/reports/` corpus is too thin for the older half of the past month | Medium (Part 1 §1.5 may be shallow) | High | The user's directive (2026-06-20) acknowledges this. Tier 3 sweep B (git log + state.toml) + sweep C (guide docs) fill the gap. Part 1 §1.5 explicitly flags "limited report coverage" where applicable. |
+| The "conservative" framing is interpreted differently by Tier 1 and the user | Medium (Part 3 may include too-aggressive recommendations) | Medium | Phase 10 user review gate catches this. Part 3 Tier 1 entries are by definition conservative (zero-risk doc edits); Tier 2-3 are flagged as "needs more evidence" or "open question." |
+| The chronology track handover's "Tier 2 cannot add audit heuristics" finding contradicts what the rebuild track may want | Low (this review is a research track; the rebuild is a separate decision) | Low | Part 2 §2.4 documents the pattern; Part 3 surfaces it as a Tier 2 entry with the rebuild track deciding. |
+| The `nagent_takeaways_meta_tooling_20260620.md` bridge is too thin | Low (it's a small artifact) | Low | The bridge is intentionally ~200 LOC; it's a pointer, not a co-equal report. |
+| The 13-15 commits become hard to review (user has to read 13-15 git notes) | Low (atomic commits are the project's convention) | Low | The commits are mechanical; the user reviews the *report* as a single document, not the commit-by-commit progression. |
+| The chunking strategy verification (Phase 9) reveals sections under-thickened | Medium (the ≥4,000 LOC floor not met) | Medium | Phase 9 may add a "fix" commit that thickens the under-target sections. The verification criteria are quantitative, not qualitative. |
+| The user wants different tier assignments than Tier 1 drafts | Medium (Part 3 reshuffles) | High | Phase 10 user review gate is the check. Part 3 tier assignments are explicitly tagged as "Tier 1 (Tier 1's assessment); user may reassign in review." |
+| The Tier 3 sub-agent outputs contradict each other (Sweep A's per-track tag disagrees with Sweep B's pattern catalog) | Medium (synthesis reconciliation) | Medium | Tier 1 reconciles in Phase 4-5; the "First LLM-behavior tag" column in `comparison_table.md` uses the most prominent tag per track, not the union. |
+| The "hard-gate bypass" pattern (2.2) is too sensitive to publish without Tier 1 review of the chronology handover first | Low (this is research; the chronology handover is already public) | Low | The chronology handover is already in `docs/reports/`; Part 2 §2.2 cites it directly. |
+| The future "workflow improvements rebuild" track picks up this report and applies too many Tier 1 entries at once | Low (not this track's concern) | Medium | Part 4's sequencing enforces the 5-phase conservative ordering. The rebuild track reads Part 4 as the gate. |
+
+---
+
+## 9. Out of Scope (Explicit)
+
+1. **Modifying any agent-directive file in the project.** The recommendations go in `workflow_improvements.md` for the deferred rebuild.
+2. **Building any recommendation.** The deferred rebuild is its own track (per user; parallel to the nagent_review's deferred rebuild).
+3. **Reviewing every external AI corpus** (nagent, Fable, Claude, OpenAI, superpowers plugin). The 4 sibling meta-analysis tracks are referenced only when directly relevant; this track is the 5th in the corpus.
+4. **Doing a per-AGENTS.md-section review.** The review identifies new patterns vs what's in AGENTS.md; it does not restructure AGENTS.md.
+5. **Rewriting or migrating `docs/superpowers/specs/*.md` → `conductor/tracks/<id>/spec.md`.** This is the dual-convention problem from the superpowers_review; it's a separate track.
+6. **Adding new `.opencode/agents/*.md` files, new `conductor/code_styleguides/*.md` files, or new `scripts/audit_*.py` scripts.** The report may *recommend* these; the rebuild creates them.
+7. **Running automated tests.** The track is research-only; verification is the brainstorming-skill self-review plus user review.
+8. **Creating new `docs/Readme.md` or `docs/AGENTS.md` entries.** The report is at `conductor/tracks/meta_tooling_workflow_review_20260620/`; it is not in the docs index.
+9. **The user's deferred workflow-improvements rebuild itself.** The recommendations in `workflow_improvements.md` + `implementation_sequencing.md` are *inputs* to that future track; the rebuild is not this track.
+10. **The chronology track's Phase 8 rewrite.** The handover document is cited as evidence in Part 2 §2.2 / §2.5 / §2.6; the rewrite is its own track per the handover's recommendation.
+
+---
+
+## 10. See Also
+
+### 10.1 Internal References
+
+- **`conductor/tracks/chronology_20260619/`** — the parallel track with the Tier 2 autonomous-failure case study. Part 2 §2.2, §2.5, §2.6 cite the handover document.
+- **`conductor/tracks/nagent_review_20260608/`** — the primary precedent. The chunking strategy is borrowed from `nagent_review_v3_1_report_20260620.md`.
+- **`conductor/tracks/fable_review_20260617/`** — the secondary precedent. The Tier 3 sub-agent dispatch pattern is borrowed from fable_review's 10 parallel cluster sub-agents.
+- **`conductor/tracks/superpowers_review_20260619/`** — the closest precedent. The verdict block template + `decisions.md` format + `nagent_takeaways_*.md` bridge pattern are all borrowed.
+- **`conductor/tracks/intent_dsl_survey_20260612/`** — the sibling reference track.
+- **`conductor/tracks/result_migration_20260616/`** + 5 sub-tracks — the largest track cluster in the past month. Part 1 §1.1 + Part 2 §2.1, §2.3, §2.4, §2.8 cite the campaign.
+- **`conductor/tracks/tier2_autonomous_sandbox_20260616/`** + `tier2_no_appdata_20260618/` + `tier2_leak_prevention_20260620/` + `tier2_sandbox_hardening_20260617/` — the Tier 2 sandbox family. Part 1 §1.2 + Part 2 §2.2, §2.5, §2.6 cite these.
+- **`AGENTS.md`** (root) — the project's top-level agent-facing rules. §"Critical Anti-Patterns" + §"Session-Learned Anti-Patterns" + §"Process Anti-Patterns" are the baseline Part 2 §N.5 compares against.
+- **`conductor/workflow.md`** — the operational workflow. §"Tier 1 Track Initialization Rules" + §"Process Anti-Patterns" + §"Skip-Marker Policy" + §"Audit Script Policy" are targets for Part 3.
+- **`conductor/product.md`** — the product vision. Part 1 references the 4-tier MMA + multi-provider descriptions.
+- **`conductor/product-guidelines.md`** — the AI-Optimized Compact Style. Part 1-4 follow the formatting heuristics.
+- **`conductor/tech-stack.md`** — the tech stack. Part 1 references the providers + module inventory.
+- **`conductor/code_styleguides/error_handling.md`** — the data-oriented error convention. Part 3 §"conductor/code_styleguides/error_handling.md" targets the Drain Points + Patterns 1-5 sections.
+- **`.opencode/agents/tier2-autonomous.md`** + **`.opencode/commands/tier-2-auto-execute.md`** — the Tier 2 directives. Part 3 §".opencode/agents/tier2-autonomous.md" targets these.
+- **`scripts/audit_exception_handling.py`** + the 3 other audit scripts — the enforcement scripts. Part 3 §"scripts/audit_*.py" targets these.
+- **`docs/AGENTS.md`** — the agent-facing mirror. Part 2 §2.10 cites the "Convention Enforcement" section as a successful past-month precedent.
+- **`docs/guide_*.md`** (36 files) — the 14 deep-dive guides. Tier 3 sweep sub-agent C scans these.
+- **`docs/reports/`** (~75 files past month) — the report corpus. Tier 3 sweep sub-agent A reads these.
+
+### 10.2 External References
+
+- **The 4 prior meta-analysis reviews** (the unified corpus this track joins):
+  - `conductor/tracks/nagent_review_20260608/report.md` + side artifacts (the primary precedent)
+  - `conductor/tracks/fable_review_20260617/` (the cluster dispatch precedent)
+  - `conductor/tracks/superpowers_review_20260619/` (the closest precedent)
+  - `conductor/tracks/intent_dsl_survey_20260612/` (the sibling reference)
+
+### 10.3 Track-internal References
+
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/spec.md`** — this file.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/metadata.json`** — the track metadata.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/state.toml`** — the track state.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/report.md`** — the main 4-part synthesis report (≥4,000 LOC).
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/comparison_table.md`** — the ~50-row flat reference.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/decisions.md`** — the prioritized rebuild backlog.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/shipped_work_index.md`** — Tier 3 sweep A output.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/llm_behavior_catalog.md`** — Tier 3 sweep B + C output.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/nagent_takeaways_meta_tooling_20260620.md`** — the bridge to the 4 sibling reviews.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/workflow_improvements.md`** — standalone Part 3 input for the rebuild track.
+- **`conductor/tracks/meta_tooling_workflow_review_20260620/implementation_sequencing.md`** — standalone Part 4 input for the rebuild track.
@@ -0,0 +1,102 @@
+# Track state for meta_tooling_workflow_review_20260620
+# Updated by Tier 1 Orchestrator as tasks complete
+# Parked 2026-06-20 (parking is recorded in [executor_handoff]; [meta].status is still "active"); awaiting executor (Tier 1 inline OR Tier 2 with explicit guard rails)
+
+[meta]
+track_id = "meta_tooling_workflow_review_20260620"
+name = "Meta-Tooling Workflow Review — Past-Month LLM Behavior Analysis"
+status = "active"
+current_phase = 0
+last_updated = "2026-06-20"
+
+[blocked_by]
+# No blockers — track is parked, awaiting executor
+
+[blocks]
+# A future workflow-improvements rebuild track consumes this track's standalone inputs (workflow_improvements.md + implementation_sequencing.md)
+workflow_improvements_rebuild = "planned in meta_tooling_workflow_review_20260620"
+
+[phases]
+phase_1 = { status = "pending", checkpointsha = "", name = "Setup" }
+phase_2 = { status = "pending", checkpointsha = "", name = "Tier 3 sub-agent sweeps" }
+phase_3 = { status = "pending", checkpointsha = "", name = "Tier 1 anchor read" }
+phase_4 = { status = "pending", checkpointsha = "", name = "Part 1 — What Shipped" }
+phase_5 = { status = "pending", checkpointsha = "", name = "Part 2 — LLM Behavior Patterns" }
+phase_6 = { status = "pending", checkpointsha = "", name = "Part 3 — Workflow Improvements" }
+phase_7 = { status = "pending", checkpointsha = "", name = "Part 4 — Implementation Sequencing" }
+phase_8 = { status = "pending", checkpointsha = "", name = "Side artifacts + standalone inputs" }
+phase_9 = { status = "pending", checkpointsha = "", name = "Self-review" }
+phase_10 = { status = "pending", checkpointsha = "", name = "User review gate" }
+phase_11 = { status = "pending", checkpointsha = "", name = "Finalize" }
+
+[tasks]
+# Phase 1 — Setup (1 commit)
+t1_1_setup_artifacts = { status = "pending", commit_sha = "", description = "Create 9 skeleton files + register in tracks.md" }
+
+# Phase 2 — Tier 3 sub-agent sweeps (3 commits, dispatched in parallel)
+t2_1_sweep_a_reports = { status = "pending", commit_sha = "", description = "Tier 3 sweep A: reports corpus -> shipped_work_index.md (~300-500 LOC)" }
+t2_2_sweep_b_structured = { status = "pending", commit_sha = "", description = "Tier 3 sweep B: git log + state.toml + spec deviations -> llm_behavior_catalog.md Part 1 (~500-700 LOC)" }
+t2_3_sweep_c_hidden_notes = { status = "pending", commit_sha = "", description = "Tier 3 sweep C: guide docs + AGENTS.md + conductor/*.md -> llm_behavior_catalog.md Part 2 (~200-300 LOC appended)" }
+
+# Phase 3 — Tier 1 anchor read (0 commits; internal scratchpad)
+t3_1_anchor_read = { status = "pending", commit_sha = "", description = "Read 10 anchor reports; produce internal scratchpad" }
+
+# Phase 4 — Part 1 synthesis (1 commit)
+t4_1_part1_synthesis = { status = "pending", commit_sha = "", description = "Write Part 1 (5 sub-sections x 160-200 LOC each = 800-1000 LOC)" }
+
+# Phase 5 — Part 2 synthesis (1-2 commits)
+t5_1_part2_synthesis = { status = "pending", commit_sha = "", description = "Write Part 2 (12 patterns x 125-170 LOC each = 1500-2000 LOC); commit at §2.6 and §2.12 if LOC > 1500" }
+
+# Phase 6 — Part 3 synthesis (1 commit)
+t6_1_part3_synthesis = { status = "pending", commit_sha = "", description = "Write Part 3 (15-25 improvements x 50-80 LOC each; target 1000-1200 LOC total); by 5 target docs x 3 confidence tiers" }
+
+# Phase 7 — Part 4 synthesis (1 commit)
+t7_1_part4_synthesis = { status = "pending", commit_sha = "", description = "Write Part 4 (5 phases x 60-100 LOC each = 300-500 LOC); conservative sequencing" }
+
+# Phase 8 — Side artifacts + standalone inputs (5 commits)
+t8_1_comparison_table = { status = "pending", commit_sha = "", description = "Write comparison_table.md (~50 rows)" }
+t8_2_decisions = { status = "pending", commit_sha = "", description = "Write decisions.md (15-25 entries)" }
+t8_3_nagent_takeaways = { status = "pending", commit_sha = "", description = "Write nagent_takeaways_meta_tooling_20260620.md (5-part bridge)" }
+t8_4_workflow_improvements_standalone = { status = "pending", commit_sha = "", description = "Write workflow_improvements.md (Part 3 verbatim standalone)" }
+t8_5_implementation_sequencing_standalone = { status = "pending", commit_sha = "", description = "Write implementation_sequencing.md (Part 4 verbatim + phase dependencies)" }
+
+# Phase 9 — Self-review (0-1 commits)
+t9_1_self_review = { status = "pending", commit_sha = "", description = "Placeholder scan + internal consistency + scope check + ambiguity check + chunking verification; fix inline" }
+
+# Phase 10 — User review gate (0 commits; user-driven)
+t10_1_user_review = { status = "pending", commit_sha = "", description = "User reviews report + side artifacts + standalone inputs; approves or iterates" }
+
+# Phase 11 — Finalize (1 commit)
+t11_1_finalize = { status = "pending", commit_sha = "", description = "Update state.toml to current_phase=11; update metadata.json with final stats; mark Recently Completed in tracks.md" }
+
+[verification]
+phase_1_complete = false
+phase_2_complete = false
+phase_3_complete = false
+phase_4_complete = false
+phase_5_complete = false
+phase_6_complete = false
+phase_7_complete = false
+phase_8_complete = false
+phase_9_complete = false
+phase_10_complete = false
+phase_11_complete = false
+report_4k_loc_floor_met = false
+user_review_approved = false
+
+[executor_handoff]
+# Notes for whichever tier picks this track up next
+parked_date = "2026-06-20"
+park_reason = "User has Tier 2 autonomous running the last result_migration_app_controller_20260618 sub-track; this track is parked to avoid token burn in the current session"
+recommended_executor = "Tier 1 inline in a fresh session (the 4-part report synthesis benefits from sustained context); Tier 2 only if explicit guard rails are added to the sandbox prompt"
+hard_gates = [
+  "Phase 9 self-review: placeholder scan + internal consistency + scope check + ambiguity check + chunking verification",
+  "Phase 10 user review gate: user must explicitly approve before Phase 11 (finalize) runs"
+]
+anti_sliming_guard = "Per the chronology_20260619 handover, the manual review gates must be respected literally. Bulk verification is NOT a substitute for per-section self-review. The implementer MUST NOT auto-verify Phase 9 to bypass the user review gate in Phase 10."
+
+[user_directives_logged]
+# All 9 user directives captured during the 2026-06-20 brainstorming session
+# See metadata.json user_directives for full text
+count = 9
+logged_in_metadata = true