Merge remote-tracking branch 'tier2-clone/tier2/post_module_taxonomy_de_cruft_20260627' into tier2/post_module_taxonomy_de_cruft_20260627
This commit is contained in:
@@ -1,23 +0,0 @@
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def run_diag(role: str, prompt: str) -> str:
    """Run ``scripts/mma_exec.py`` for *role* with *prompt* and echo what it printed.

    The script is launched with the current interpreter (``sys.executable``).
    Its captured stdout and stderr are printed under ``STDOUT:`` / ``STDERR:``
    banners.

    Args:
        role: Value passed to the script's ``--role`` option.
        prompt: Positional prompt argument passed to the script.

    Returns:
        The child's captured stdout, or ``str(exception)`` if launching the
        child or printing its output raised.
    """
    print(f"--- Running Diag for {role} ---")
    cmd = [sys.executable, "scripts/mma_exec.py", "--role", role, prompt]
    try:
        # errors='replace' keeps the diagnostic output usable when the child
        # emits bytes that are not valid UTF-8; with strict decoding a single
        # bad byte raised UnicodeDecodeError and discarded everything.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
        )
        print("STDOUT:")
        print(result.stdout)
        print("STDERR:")
        print(result.stderr)
        return result.stdout
    except Exception as e:
        # Deliberately broad: this is a best-effort diagnostic helper, so any
        # failure is reported and returned rather than propagated.
        print(f"FAILED: {e}")
        return str(e)
|
||||
|
||||
if __name__ == "__main__":
    # Each case pairs the banner printed before the run with the prompt sent
    # to the tier3-worker role. Case 1 is a simple file read; case 2 is a
    # shell command.
    diag_cases = (
        (
            "TEST 1: read_file",
            "Read the file 'pyproject.toml' and tell me the version of the project. ONLY the version string.",
        ),
        (
            "\nTEST 2: run_shell_command",
            "Use run_shell_command to execute 'echo HELLO_SUBAGENT' and return the output. ONLY the output.",
        ),
    )
    for banner, diag_prompt in diag_cases:
        print(banner)
        run_diag("tier3-worker", diag_prompt)
|
||||
@@ -1,64 +0,0 @@
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Ensure project root is in path so we can import src.gui_2
|
||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
||||
if project_root not in sys.path:
|
||||
sys.path.insert(0, project_root)
|
||||
|
||||
class TestMarkdownTableWidth(unittest.TestCase):
    """Layout checks for ``src.gui_2.render_discussion_entry`` run against a mocked imgui."""

    def test_render_discussion_entry_full_width(self):
        """
        Verify that render_discussion_entry calls imgui.dummy with the full available width.
        """
        # Mock all dependencies to avoid side effects and complex setup during import/execution
        with patch('src.gui_2.imgui') as mock_imgui, \
             patch('src.gui_2.imscope') as mock_imscope, \
             patch('src.gui_2.theme') as mock_theme, \
             patch('src.gui_2.project_manager') as mock_pm, \
             patch('src.gui_2.render_thinking_trace') as mock_rtt, \
             patch('src.gui_2.render_discussion_entry_read_mode') as mock_rderm:

            # 1. Setup available width and coordinates
            expected_width = 850.0
            mock_avail = MagicMock()
            mock_avail.x = expected_width
            mock_imgui.get_content_region_avail.return_value = mock_avail

            # Mock ImVec2 to return a simple tuple for easier assertion
            mock_imgui.ImVec2.side_effect = lambda x, y: (x, y)

            # 2. Mock app and entry state
            mock_app = MagicMock()
            mock_app.disc_roles = ["User", "Assistant"]

            entry = {
                "role": "User",
                "content": "Hello world",
                "collapsed": False,
                "read_mode": False
            }

            # Mock interactive elements
            mock_imgui.begin_combo.return_value = False
            mock_imgui.button.return_value = False
            mock_imgui.input_text_multiline.return_value = (False, entry["content"])

            # 3. Import the function within the patch context
            from src.gui_2 import render_discussion_entry

            # 4. Execute the function
            render_discussion_entry(mock_app, entry, 0)

            # 5. Verification
            # The function should call imgui.dummy(imgui.ImVec2(full_width, 0))
            mock_imgui.dummy.assert_any_call((expected_width, 0.0))

            # CRITICAL: Verify newline or spacing is called to prevent squashing
            # We expect this to fail currently
            # assertTrue (not a bare assert) so the check survives `python -O`
            # and reports a readable message through the unittest runner.
            self.assertTrue(
                mock_imgui.new_line.called or mock_imgui.spacing.called,
                "expected imgui.new_line() or imgui.spacing() to be called",
            )


if __name__ == '__main__':
    unittest.main()
|
||||
@@ -1,33 +0,0 @@
|
||||
import inspect
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
# Ensure project root is in path
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
||||
|
||||
def test_gui_monolithic_symbols():
    """Check that the render helpers are exposed by, and attributed to, ``src.gui_2``.

    Fails through ``pytest.fail`` when the import itself breaks. Otherwise
    asserts, for each helper, that the module has the attribute and that
    ``inspect.getmodule`` reports ``src.gui_2`` as its module.
    """
    try:
        from src.gui_2 import App, render_discussion_entry, render_thinking_trace
        import src.gui_2
    except ImportError as e:
        pytest.fail(f"FAILURE: Could not import from src.gui_2: {e}")

    # Verify App is importable
    assert App is not None

    # One entry per symbol that must live in src.gui_2; the checks below were
    # previously copy-pasted once per symbol.
    expected_local = {
        'render_discussion_entry': render_discussion_entry,
        'render_thinking_trace': render_thinking_trace,
    }
    for symbol, func in expected_local.items():
        # Verify the symbol is in src.gui_2
        assert hasattr(src.gui_2, symbol), f"{symbol} missing from src.gui_2"

        # Verify it's defined in src.gui_2, not imported
        owner = inspect.getmodule(func)
        assert owner is not None, f"Could not determine module for {symbol}"
        assert owner.__name__ == 'src.gui_2', f"{symbol} expected in src.gui_2, but found in {owner.__name__}"
|
||||
@@ -1,29 +0,0 @@
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from src.imgui_scopes import _ScopeId
|
||||
import src.imgui_scopes as imgui_scopes
|
||||
|
||||
def test_scope_id_string():
    """Entering and leaving a ``_ScopeId`` built from a str pushes that str and pops once."""
    with patch('src.imgui_scopes.imgui') as fake_imgui:
        with _ScopeId("test_id"):
            pass
        fake_imgui.push_id.assert_called_once_with("test_id")
        fake_imgui.pop_id.assert_called_once()
|
||||
|
||||
def test_scope_id_int():
    """A ``_ScopeId`` built from an int is expected to push the int's string form."""
    with patch('src.imgui_scopes.imgui') as fake_imgui:
        # Python type hint is str, but we test runtime resilience
        with _ScopeId(1234):
            pass
        # Verify it was converted to string to prevent low-level crashes
        fake_imgui.push_id.assert_called_once_with("1234")
        fake_imgui.pop_id.assert_called_once()
|
||||
|
||||
def test_id_helper_function():
    """``imgui_scopes.id(42)`` used as a context manager pushes ``"42"`` and pops once."""
    with patch('src.imgui_scopes.imgui') as fake_imgui:
        scope = imgui_scopes.id(42)
        with scope:
            pass
        fake_imgui.push_id.assert_called_once_with("42")
        fake_imgui.pop_id.assert_called_once()
|
||||
@@ -1,60 +0,0 @@
|
||||
import subprocess
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
def run_ps_script(role: str, prompt: str) -> subprocess.CompletedProcess:
    """Invoke ``./scripts/run_subagent.ps1`` through PowerShell and return the result.

    Non-empty captured stdout and stderr are echoed with a per-role banner
    before the ``subprocess.run`` result is handed back to the caller.
    """
    # Using -File is safer and handles arguments better
    launcher = ["powershell", "-NoProfile", "-ExecutionPolicy", "Bypass"]
    script_args = ["-File", "./scripts/run_subagent.ps1", "-Role", role, "-Prompt", prompt]
    completed = subprocess.run(launcher + script_args, capture_output=True, text=True)

    out_text = completed.stdout
    err_text = completed.stderr
    if out_text:
        print(f"\n[Sub-Agent {role} Output]:\n{out_text}")
    if err_text:
        print(f"\n[Sub-Agent {role} Error]:\n{err_text}")
    return completed
|
||||
|
||||
@patch('subprocess.run')
def test_subagent_script_qa_live(mock_run) -> None:
    """Verify the QA role path of run_ps_script with ``subprocess.run`` mocked.

    Checks both that the role and prompt reach the PowerShell command line and
    that the canned sub-agent reply is returned unchanged.
    """
    mock_run.return_value = MagicMock(returncode=0, stdout='Fix the division by zero error.', stderr='')
    prompt = "Traceback (most recent call last): File 'test.py', line 1, in <module> 1/0 ZeroDivisionError: division by zero"
    result = run_ps_script("QA", prompt)

    # Without these checks the test only re-reads the MagicMock's canned
    # values and would pass even if the wrong command were built.
    mock_run.assert_called_once()
    cmd = mock_run.call_args[0][0]
    assert cmd[cmd.index("-Role") + 1] == "QA"
    assert cmd[cmd.index("-Prompt") + 1] == prompt

    assert result.returncode == 0
    # Expected output should mention the fix for division by zero
    assert "zero" in result.stdout.lower()
    # It should be short (QA agents compress)
    assert len(result.stdout.split()) < 40
|
||||
|
||||
@patch('subprocess.run')
def test_subagent_script_worker_live(mock_run) -> None:
    """Verify the Worker role path of run_ps_script with ``subprocess.run`` mocked.

    Checks both that the role and prompt reach the PowerShell command line and
    that the canned code reply is returned unchanged.
    """
    mock_run.return_value = MagicMock(returncode=0, stdout='def hello(): return "hello world"', stderr='')
    prompt = "Write a python function that returns 'hello world'"
    result = run_ps_script("Worker", prompt)

    # Without these checks the test only re-reads the MagicMock's canned
    # values and would pass even if the wrong command were built.
    mock_run.assert_called_once()
    cmd = mock_run.call_args[0][0]
    assert cmd[cmd.index("-Role") + 1] == "Worker"
    assert cmd[cmd.index("-Prompt") + 1] == prompt

    assert result.returncode == 0
    assert "def" in result.stdout.lower()
    assert "hello" in result.stdout.lower()
|
||||
|
||||
@patch('subprocess.run')
def test_subagent_script_utility_live(mock_run) -> None:
    """Verify the Utility role path of run_ps_script with ``subprocess.run`` mocked.

    Checks both that the role and prompt reach the PowerShell command line and
    that the canned reply is returned unchanged.
    """
    mock_run.return_value = MagicMock(returncode=0, stdout='True', stderr='')
    prompt = "Tell me 'True' if 1+1=2, otherwise 'False'"
    result = run_ps_script("Utility", prompt)

    # Without these checks the test only re-reads the MagicMock's canned
    # values and would pass even if the wrong command were built.
    mock_run.assert_called_once()
    cmd = mock_run.call_args[0][0]
    assert cmd[cmd.index("-Role") + 1] == "Utility"
    assert cmd[cmd.index("-Prompt") + 1] == prompt

    assert result.returncode == 0
    assert "true" in result.stdout.lower()
|
||||
|
||||
@patch('subprocess.run')
def test_subagent_isolation_live(mock_run) -> None:
    """Verify the stateless-agent prompt path of run_ps_script with ``subprocess.run`` mocked.

    The reply is canned, so this cannot prove real isolation. It checks that
    only the given prompt is forwarded and that the reply is returned
    unchanged.
    """
    mock_run.return_value = MagicMock(returncode=0, stdout='UNKNOWN', stderr='')
    # This prompt asks the sub-agent about a 'secret' mentioned only here, not in its prompt.
    prompt = "What is the secret code I just told you? If I didn't tell you, say 'UNKNOWN'."
    result = run_ps_script("Utility", prompt)

    # Without these checks the test only re-reads the MagicMock's canned
    # values and would pass even if the wrong command were built.
    mock_run.assert_called_once()
    cmd = mock_run.call_args[0][0]
    assert cmd[cmd.index("-Role") + 1] == "Utility"
    assert cmd[cmd.index("-Prompt") + 1] == prompt

    assert result.returncode == 0
    # A stateless agent should not know any previous context.
    assert "unknown" in result.stdout.lower()
|
||||
@@ -1,140 +0,0 @@
|
||||
import pytest
|
||||
import os
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
from scripts.mma_exec import create_parser, get_role_documents, execute_agent, get_model_for_role, get_dependencies
|
||||
|
||||
def test_parser_role_choices() -> None:
    """Each tier role is accepted by ``--role`` and the positional prompt is preserved."""
    parser = create_parser()
    test_prompt = "Analyze the codebase for bottlenecks."
    for role in ('tier1', 'tier2', 'tier3', 'tier4'):
        parsed = parser.parse_args(['--role', role, test_prompt])
        assert parsed.role == role
        assert parsed.prompt == test_prompt
|
||||
|
||||
def test_parser_invalid_role() -> None:
    """A role outside the parser's choices makes ``parse_args`` raise ``SystemExit``."""
    parser = create_parser()
    bad_argv = ['--role', 'tier5', 'Some prompt']
    with pytest.raises(SystemExit):
        parser.parse_args(bad_argv)
|
||||
|
||||
def test_parser_prompt_optional() -> None:
    """With only ``--role`` given, parsing succeeds and ``prompt`` is ``None``."""
    # Prompt is now optional (nargs='?')
    parsed = create_parser().parse_args(['--role', 'tier3'])
    assert parsed.role == 'tier3'
    assert parsed.prompt is None
|
||||
|
||||
def test_parser_help() -> None:
    """``--help`` terminates parsing via ``SystemExit`` with exit code 0."""
    parser = create_parser()
    with pytest.raises(SystemExit) as exit_info:
        parser.parse_args(['--help'])
    exit_code = exit_info.value.code
    assert exit_code == 0
|
||||
|
||||
def test_get_role_documents() -> None:
    """``get_role_documents`` returns the expected documentation path list for each tier."""
    expected_docs = {
        'tier1': ['conductor/product.md', 'conductor/product-guidelines.md', 'docs/guide_architecture.md', 'docs/guide_mma.md'],
        'tier2': ['conductor/tech-stack.md', 'conductor/workflow.md', 'docs/guide_architecture.md', 'docs/guide_mma.md'],
        'tier3': ['docs/guide_architecture.md'],
        'tier4': ['docs/guide_architecture.md'],
    }
    for tier, docs in expected_docs.items():
        assert get_role_documents(tier) == docs
|
||||
|
||||
def test_get_model_for_role() -> None:
    """``get_model_for_role`` maps each full role name to its expected model id."""
    expected_models = (
        ('tier1-orchestrator', 'gemini-3.1-pro-preview'),
        ('tier2-tech-lead', 'gemini-3-flash-preview'),
        ('tier3-worker', 'gemini-3-flash-preview'),
        ('tier4-qa', 'gemini-2.5-flash-lite'),
    )
    for role_name, model_id in expected_models:
        assert get_model_for_role(role_name) == model_id
|
||||
|
||||
def test_execute_agent() -> None:
    """
    Test that execute_agent calls subprocess.run with powershell and the correct gemini CLI arguments
    including the model specified for the role.
    """
    role = "tier3-worker"
    prompt = "Write a unit test."
    docs = ["file1.py", "docs/spec.md"]
    expected_model = "gemini-3-flash-preview"
    mock_stdout = "Mocked AI Response"

    fake_completed = MagicMock()
    fake_completed.stdout = mock_stdout
    fake_completed.returncode = 0

    with patch("subprocess.run") as mock_run:
        mock_run.return_value = fake_completed
        result = execute_agent(role, prompt, docs)
        mock_run.assert_called_once()
        call_positional, call_keywords = mock_run.call_args

        # The first positional argument is the PowerShell argv list.
        cmd_list = call_positional[0]
        assert cmd_list[0] == "powershell.exe"
        assert "-Command" in cmd_list
        ps_cmd = cmd_list[cmd_list.index("-Command") + 1]
        assert "gemini" in ps_cmd
        assert f"--model {expected_model}" in ps_cmd

        # Verify input contains the prompt and system directive
        input_text = call_keywords.get("input")
        assert "STRICT SYSTEM DIRECTIVE" in input_text
        assert "TASK: Write a unit test." in input_text
        assert call_keywords.get("capture_output") is True
        assert call_keywords.get("text") is True
        assert result == mock_stdout
|
||||
|
||||
def test_get_dependencies(tmp_path: Path) -> None:
    """``get_dependencies`` lists imported module names in source order for a small script."""
    import_lines = [
        "import os",
        "import sys",
        "import file_cache",
        "from mcp_client import something",
    ]
    filepath = tmp_path / "mock_script.py"
    # Every line, including the last, ends with a newline.
    filepath.write_text("".join(line + "\n" for line in import_lines))
    found = get_dependencies(str(filepath))
    assert found == ['os', 'sys', 'file_cache', 'mcp_client']
|
||||
|
||||
import re
|
||||
|
||||
def test_execute_agent_logging(tmp_path: Path) -> None:
    """After execute_agent runs, the patched LOG_FILE holds the role, the prompt and a YYYY-MM-DD date."""
    log_file = tmp_path / "mma_delegation.log"
    test_role = "tier1"
    test_prompt = "Plan the next phase"

    fake_completed = MagicMock()
    fake_completed.stdout = ""
    fake_completed.returncode = 0

    # mma_exec now uses logs/agents/ for individual logs and logs/mma_delegation.log for master
    # We will patch LOG_FILE to point to our temp location
    with patch("scripts.mma_exec.LOG_FILE", str(log_file)), \
         patch("subprocess.run") as mock_run:
        mock_run.return_value = fake_completed
        execute_agent(test_role, test_prompt, [])
        assert log_file.exists()
        log_content = log_file.read_text()
        assert test_role in log_content
        assert test_prompt in log_content  # Master log should now have the summary prompt
        assert re.search(r"\d{4}-\d{2}-\d{2}", log_content)
|
||||
|
||||
def test_execute_agent_tier3_injection(tmp_path: Path) -> None:
    """For a tier3-worker run, the agent input includes a skeleton of the edited file's local dependency."""
    main_file = tmp_path / "main.py"
    main_file.write_text("import dependency\n\ndef run():\n dependency.do_work()\n")
    dep_file = tmp_path / "dependency.py"
    dep_file.write_text("def do_work():\n pass\n\ndef other_func():\n print('hello')\n")

    fake_completed = MagicMock()
    fake_completed.stdout = "OK"
    fake_completed.returncode = 0

    # We need to ensure generate_skeleton is mockable or working
    # The relative path 'main.py' is passed below, so run from tmp_path and
    # always restore the previous working directory.
    previous_dir = os.getcwd()
    os.chdir(tmp_path)
    try:
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = fake_completed
            execute_agent('tier3-worker', 'Modify main.py', ['main.py'])
            assert mock_run.called
            input_text = mock_run.call_args[1].get("input")
            assert "DEPENDENCY SKELETON: dependency.py" in input_text
            assert "def do_work():" in input_text
            assert "Modify main.py" in input_text
    finally:
        os.chdir(previous_dir)
|
||||
@@ -1,40 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add src to path
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
||||
|
||||
from src.history import HistoryManager
|
||||
|
||||
def verify_phase_1():
    """Smoke-check HistoryManager push, undo and redo.

    Prints a diagnostic and exits the process with status 1 on the first
    mismatch; prints a PASSED line when every check holds.
    """
    def _abort(message):
        # Report the failed expectation and stop the script.
        print(message)
        sys.exit(1)

    print("Verifying Phase 1: History Core Logic...")
    hm = HistoryManager(max_capacity=10)

    # Test push
    hm.push({"test": 1}, "initial")
    if not hm.can_undo:
        _abort("Error: can_undo should be true after push")

    # Test undo
    entry = hm.undo({"test": 2}, "current")
    if entry.state != {"test": 1}:
        _abort(f"Error: expected state {{'test': 1}}, got {entry.state}")
    if entry.description != "initial":
        _abort(f"Error: expected description 'initial', got {entry.description}")

    # Test redo
    entry = hm.redo({"test": 1}, "back")
    if entry.state != {"test": 2}:
        _abort(f"Error: expected state {{'test': 2}}, got {entry.state}")
    if entry.description != "current":
        _abort(f"Error: expected description 'current', got {entry.description}")

    print("Phase 1 verification PASSED.")


if __name__ == "__main__":
    verify_phase_1()
|
||||
@@ -1,24 +0,0 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
|
||||
def verify_phase_2():
    """Run ``tests/test_undo_redo_sim.py`` via ``uv run pytest`` and report the outcome.

    Prints a PASSED line on a zero exit code. Otherwise prints FAILED followed
    by the captured stdout and stderr, then exits the process with status 1.
    """
    print("Verifying Phase 2: Text Input & Control Undo/Redo...")

    # Run the simulation test
    pytest_cmd = ["uv", "run", "pytest", "tests/test_undo_redo_sim.py"]
    result = subprocess.run(pytest_cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Phase 2 verification PASSED.")
        return

    print("Phase 2 verification FAILED.")
    for captured in (result.stdout, result.stderr):
        print(captured)
    sys.exit(1)


if __name__ == "__main__":
    verify_phase_2()
|
||||
@@ -1,24 +0,0 @@
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def verify_phase_3():
    """Run ``tests/test_workspace_profiles_sim.py`` via ``uv run pytest`` and report the outcome.

    Prints a PASSED line on a zero exit code. Otherwise prints FAILED followed
    by the captured stdout and stderr, then exits the process with status 1.
    """
    print("Verifying Phase 3: GUI Menu Integration...")

    # We rely on the existing simulation test to verify the callback logic,
    # which underpins the GUI menu integration.
    pytest_cmd = ["uv", "run", "pytest", "tests/test_workspace_profiles_sim.py"]
    result = subprocess.run(pytest_cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Phase 3 verification PASSED.")
        return

    print("Phase 3 verification FAILED.")
    for captured in (result.stdout, result.stderr):
        print(captured)
    sys.exit(1)


if __name__ == "__main__":
    verify_phase_3()
|
||||
@@ -1,23 +0,0 @@
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
|
||||
def verify_phase_4():
    """Run ``tests/test_auto_switch_sim.py`` via ``uv run pytest`` and report the outcome.

    Prints a PASSED line on a zero exit code. Otherwise prints FAILED followed
    by the captured stdout and stderr, then exits the process with status 1.
    """
    print("Verifying Phase 4: Contextual Auto-Switch...")

    pytest_cmd = ["uv", "run", "pytest", "tests/test_auto_switch_sim.py"]
    result = subprocess.run(pytest_cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Phase 4 verification PASSED.")
        return

    print("Phase 4 verification FAILED.")
    for captured in (result.stdout, result.stderr):
        print(captured)
    sys.exit(1)


if __name__ == "__main__":
    verify_phase_4()
|
||||
@@ -21,6 +21,8 @@ permission:
|
||||
"git reset*": deny
|
||||
---
|
||||
|
||||
Note: You may use superpowers skills to assist you (brainstorming, receiving code reviews, writing plans, writing skills, dispatching parallel agents)
|
||||
|
||||
STRICT SYSTEM DIRECTIVE: You are a Tier 2 Tech Lead in AUTONOMOUS mode, running in the **META-TOOLING** domain (per `docs/guide_meta_boundary.md`). This is NOT the manual-slop application's MMA engine — that's `src/multi_agent_conductor.py` in the APPLICATION domain. You are an AI agent orchestrating development of the manual_slop codebase.
|
||||
|
||||
## MANDATORY: Domain Distinction (added 2026-06-27)
|
||||
@@ -115,6 +117,8 @@ These are all attempts to rewrite history. They are BANNED. The right answer is
|
||||
## Conventions (MUST follow - added 2026-06-17; updated 2026-06-27)
|
||||
|
||||
- **Test runner:** ALWAYS use `uv run python scripts/run_tests_batched.py` for test runs. NEVER call `uv run pytest` directly. The batched runner provides tier-based filtering, parallelization (xdist), and a summary table. Direct pytest is slow and bypasses the tiering that the live_gui tests depend on.
|
||||
- **NEVER filter test output** (added 2026-06-27 per user directive). Do NOT pipe test output through `Select-Object`, `| Select -First N`, `| Select -Last N`, `head`, `tail`, or any truncation filter. If you need to see more output later, you'll have to re-run the entire test — which wastes time and context. Instead, ALWAYS redirect to a log file: `uv run python scripts/run_tests_batched.py > tests/artifacts/tier2_state/<track>/test_run_<phase>_<task>.log 2>&1`. Then read the log file with `manual-slop_read_file` or `grep` to find the relevant sections. The log file is your full record; you can search it without re-running.
|
||||
- **Prefer targeted tier runs** (added 2026-06-27 per user directive). Do NOT run the full 11-tier batch for every verification. Run only the tiers relevant to the current task (e.g., `uv run python scripts/run_tests_batched.py --tier tier3` or `--filter test_<specific_file>`). The full batch is for the USER to run after merge review, not for Tier 2's per-task verification. Running the full batch every time wastes 20+ minutes and the output is too large to be useful in context.
|
||||
- **Default branch:** this repo uses `master` (not `main`). Always use `origin/master` in `git fetch` and as the base for new branches. Do not assume `main` exists.
|
||||
- **Line endings:** preserve existing line endings on edit. This repo has a mix of CRLF and LF (a repo-wide LF standardization is a future track). If the file is CRLF, keep it CRLF. If the file is LF, keep it LF. Do not add CRLF to LF files or strip CRLF from CRLF files.
|
||||
- **Throw-away scripts:** write them to `scripts/tier2/artifacts/<track-name>/`, NOT the base `scripts/tier2/` directory. The base directory is reserved for production code that ships with the sandbox (failcount.py, run_track.py, write_report.py, the .ps1 launchers). Throw-away scripts are kept for archival but live in a track-specific subdir so they don't pollute the base.
|
||||
|
||||
@@ -51,6 +51,8 @@ Optional flags: `--resume` (continue from last completed task), `--toast` (Windo
|
||||
## Conventions (MUST follow - added 2026-06-17; updated 2026-06-27)
|
||||
|
||||
- **Test runner:** use `uv run python scripts/run_tests_batched.py` (NOT `uv run pytest`)
|
||||
- **NEVER filter test output** (added 2026-06-27 per user directive). Do NOT pipe test output through `Select-Object`, `| Select -First N`, `| Select -Last N`, `head`, `tail`, or any truncation filter. Instead, ALWAYS redirect to a log file: `uv run python scripts/run_tests_batched.py > tests/artifacts/tier2_state/<track>/test_run_<phase>_<task>.log 2>&1`. Then read the log file to find relevant sections. The log file is your full record; you can search it without re-running.
|
||||
- **Prefer targeted tier runs** (added 2026-06-27 per user directive). Do NOT run the full 11-tier batch for every verification. Run only the tiers relevant to the current task (e.g., `--tier tier3` or `--filter test_<specific_file>`). The full batch is for the USER to run after merge review, not for Tier 2's per-task verification.
|
||||
- **Default branch:** `master` (this repo never had `main`)
|
||||
- **Line endings:** preserve existing (CRLF stays CRLF, LF stays LF)
|
||||
- **Throw-away scripts:** write to `scripts/tier2/artifacts/<track-name>/`, NOT the base directory
|
||||
|
||||
@@ -0,0 +1,108 @@
|
||||
{
|
||||
"track_id": "directive_hotswap_harness_20260627",
|
||||
"name": "Directive Hot-Swap Harness (OpenCode Directive Presets)",
|
||||
"status": "active",
|
||||
"branch": "master",
|
||||
"created": "2026-06-27",
|
||||
"owner": "Tier 1 (initialized); implementation delegated to Tier 2/3.",
|
||||
"blocked_by": [],
|
||||
"blocks": ["directive_encoding_experiments (future; alternative v2+ variant authoring)", "manual_slop_directive_lab (future; GUI integration)"],
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"conductor/directives/<48 directive directories>/v1.md (48 files)",
|
||||
"conductor/directives/presets/current_baseline.md",
|
||||
"docs/reports/TRACK_COMPLETION_directive_hotswap_harness_20260627.md"
|
||||
],
|
||||
"modified_files": [
|
||||
".opencode/agents/tier1-orchestrator.md (replace hardcoded reading list with warm with:)",
|
||||
".opencode/agents/tier2-tech-lead.md (same)",
|
||||
".opencode/agents/tier3-worker.md (same)",
|
||||
".opencode/agents/tier4-qa.md (same)",
|
||||
"conductor/tier2/agents/tier2-autonomous.md (same)"
|
||||
],
|
||||
"deleted_files": []
|
||||
},
|
||||
"estimated_effort": {
|
||||
"method": "scope (per workflow.md Tier 1 Track Initialization Rules. NO day estimates.)",
|
||||
"phase_1": "10 steps: harvest 48 directives from doc tree into conductor/directives/ with exact source file:line refs",
|
||||
"phase_2": "8 steps: baseline preset + 5 role-prompt warm with: updates",
|
||||
"phase_3": "4 steps: verification + end-of-track report"
|
||||
},
|
||||
"verification_criteria": [
|
||||
"48 directive directories exist under conductor/directives/, each with a v1.md file",
|
||||
"Each v1.md has a header annotating the source location (file:line) and why this iteration exists",
|
||||
"conductor/directives/presets/current_baseline.md exists and lists all 48 directives",
|
||||
"All 5 tier role prompts have a 'warm with: conductor/directives/presets/current_baseline.md' line",
|
||||
"Non-directive reads (AGENTS.md, workflow.md, edit_workflow.md, forbidden-files.txt, guide_*.md) remain hardcoded in the role prompts",
|
||||
"Original docs are NOT modified (conductor/directives/ is a parallel structure)",
|
||||
"No scripts, no TOML, no build steps — markdown-only",
|
||||
"docs/reports/TRACK_COMPLETION_directive_hotswap_harness_20260627.md exists"
|
||||
],
|
||||
"regressions_and_pre_existing_failures": [],
|
||||
"pre_existing_failures_remaining": [],
|
||||
"deferred_to_followup_tracks": [
|
||||
{
|
||||
"title": "Alternative encoding authoring (v2+ variants)",
|
||||
"description": "Author v2_rationale_first.md, v3_before_after.md, v4_tabular.md etc. per directive. The actual experimentation.",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Manual Slop Directive Lab (GUI integration)",
|
||||
"description": "A Directive Lab panel in Manual Slop for virtualized directive selection + context aggregation.",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Token-cost analysis tooling",
|
||||
"description": "Measure token cost per directive variant. Compare compliance vs token cost.",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Automated compliance testing",
|
||||
"description": "Test harness to measure LLM compliance per encoding (does the LLM follow the directive?).",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Video Analysis Campaign 2 (4 new videos)",
|
||||
"description": "Separate campaign; follows the 3-pass pattern. May inform alternative encoding strategies.",
|
||||
"track_status": "not yet initialized; separate track"
|
||||
}
|
||||
],
|
||||
"risk_register": [
|
||||
{
|
||||
"id": "R1",
|
||||
"description": "Harvest completeness: directives embedded in prose may be missed",
|
||||
"likelihood": "medium",
|
||||
"impact": "the baseline preset is incomplete; some directives are not swappable",
|
||||
"mitigation": "systematic combing of the entire doc tree with grep; the plan's Step 1.1-1.10 cover every doc file identified in the spec's source list"
|
||||
},
|
||||
{
|
||||
"id": "R2",
|
||||
"description": "Granularity ambiguity: some directives overlap (e.g., ban_dict_any + typed_dataclass_fields are two sides of the same coin)",
|
||||
"likelihood": "medium",
|
||||
"impact": "the directive count is inflated by overlapping directives; preset becomes verbose",
|
||||
"mitigation": "the 48-directive list is the initial best-guess; granularity is resolved iteratively as the user experiments. Merging directives is a future preset edit, not a blocker."
|
||||
},
|
||||
{
|
||||
"id": "R3",
|
||||
"description": "LLM doesn't follow the warm with: instruction reliably",
|
||||
"likelihood": "low",
|
||||
"impact": "the LLM doesn't read the preset or the variant files; directives are missing from context",
|
||||
"mitigation": "the instruction is simple (read a file, read the files it lists) and uses the existing file-reading behavior. The Step 3.2 manual verification catches this."
|
||||
},
|
||||
{
|
||||
"id": "R4",
|
||||
"description": "Role-prompt update breaks existing Tier 2 autonomous runs",
|
||||
"likelihood": "low",
|
||||
"impact": "Tier 2 starts reading a different set of files; behavior changes",
|
||||
"mitigation": "the current_baseline preset lists the exact same directives that were hardcoded. The change is structural (where the list lives), not semantic (what the directives say)."
|
||||
}
|
||||
],
|
||||
"campaign_context": {
|
||||
"campaign_name": "Directive Encoding Campaign (Campaign A)",
|
||||
"track_1": "directive_hotswap_harness_20260627 (THIS; harvest + scaffold + baseline preset + role-prompt bootstrap)",
|
||||
"track_2": "directive_encoding_experiments (future; v2+ variant authoring + preset experimentation)",
|
||||
"track_3": "manual_slop_directive_lab (future; GUI integration)",
|
||||
"sibling_campaign": "Video Analysis Campaign 2 (Campaign B; 4 new videos; separate track)",
|
||||
"cross_campaign_relationship": "Intellectual cross-pollination; no hard dependency. Video insights may surface alternative encoding strategies. The harness design mirrors the video campaign's deobfuscation pattern (same content, different encoding)."
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,490 @@
|
||||
# Directive Hot-Swap Harness Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Build a directive hot-swap harness that lets the user maintain alternative encodings of the same directive as separate files, compose them into named presets (markdown bills of materials), and hot-swap which preset is active via a single `warm with: <path>` instruction in the role prompt or session message.
|
||||
|
||||
**Architecture:** A `conductor/directives/` directory tree where each directive is a subdirectory and each encoding variant is a file (`v1.md`, `v2_<style>.md`). Presets in `conductor/directives/presets/` are markdown files listing which variant files to read. The 5 tier role prompts are updated with a single `warm with: <preset_path>` line that replaces the hardcoded mandatory-reading list. No scripts, no TOML, no build steps — markdown-only, LLM-native.
|
||||
|
||||
**Tech Stack:** Markdown files. No code changes. No tests (this is a documentation/tooling track, not a code track). The "test" is: does an LLM follow the `warm with:` instruction and read the listed files?
|
||||
|
||||
**Spec:** `docs/superpowers/specs/2026-06-27-directive-hotswap-harness-design.md`
|
||||
|
||||
---
|
||||
|
||||
## File Structure
|
||||
|
||||
### New files (created by this plan)
|
||||
|
||||
```
|
||||
conductor/directives/
|
||||
ban_dict_any/v1.md
|
||||
ban_any_type/v1.md
|
||||
ban_optional_returns/v1.md
|
||||
ban_hasattr_dispatch/v1.md
|
||||
ban_getattr_dispatch/v1.md
|
||||
ban_dict_get_on_known_fields/v1.md
|
||||
ban_local_imports/v1.md
|
||||
ban_prefix_aliasing/v1.md
|
||||
ban_repeated_from_dict/v1.md
|
||||
boundary_layer_exception/v1.md
|
||||
result_error_pattern/v1.md
|
||||
nil_sentinel_pattern/v1.md
|
||||
typed_dataclass_fields/v1.md
|
||||
metadata_boundary_type/v1.md
|
||||
one_space_indent/v1.md
|
||||
no_comments_in_body/v1.md
|
||||
no_diagnostic_noise/v1.md
|
||||
type_hints_required/v1.md
|
||||
sdm_dependency_tags/v1.md
|
||||
file_naming_convention/v1.md
|
||||
no_new_src_files_without_permission/v1.md
|
||||
large_files_are_fine/v1.md
|
||||
atomic_per_task_commits/v1.md
|
||||
tdd_red_green_required/v1.md
|
||||
ban_arbitrary_core_mocking/v1.md
|
||||
live_gui_poll_not_sleep/v1.md
|
||||
batch_verification_not_isolation/v1.md
|
||||
git_hard_bans/v1.md
|
||||
ban_day_estimates/v1.md
|
||||
no_output_filtering/v1.md
|
||||
prefer_targeted_tier_runs/v1.md
|
||||
mandatory_research_first/v1.md
|
||||
no_skip_markers_as_avoidance/v1.md
|
||||
deduction_loop_limit/v1.md
|
||||
report_instead_of_fix_ban/v1.md
|
||||
scope_creep_track_doc_ban/v1.md
|
||||
inherited_cruft_ask_first/v1.md
|
||||
verbose_commit_message_ban/v1.md
|
||||
imgui_scope_verification/v1.md
|
||||
modular_controller_pattern/v1.md
|
||||
ui_delegation_for_hot_reload/v1.md
|
||||
strict_state_management/v1.md
|
||||
comprehensive_logging/v1.md
|
||||
feature_flag_delete_to_turn_off/v1.md
|
||||
rag_six_rules/v1.md
|
||||
cache_stable_to_volatile/v1.md
|
||||
knowledge_harvest_pattern/v1.md
|
||||
|
||||
presets/
|
||||
current_baseline.md
|
||||
```
|
||||
|
||||
### Modified files
|
||||
|
||||
```
|
||||
.opencode/agents/tier1-orchestrator.md (replace mandatory-reading list with warm with:)
|
||||
.opencode/agents/tier2-tech-lead.md (same)
|
||||
.opencode/agents/tier3-worker.md (same)
|
||||
.opencode/agents/tier4-qa.md (same)
|
||||
conductor/tier2/agents/tier2-autonomous.md (same)
|
||||
```
|
||||
|
||||
### NOT modified (the original docs stay untouched)
|
||||
|
||||
```
|
||||
AGENTS.md (stays as canonical source)
|
||||
conductor/workflow.md (stays as canonical source)
|
||||
conductor/product-guidelines.md (stays as canonical source)
|
||||
conductor/code_styleguides/*.md (all stay as canonical source)
|
||||
docs/*.md (all stay as canonical source)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Directive Harvest
|
||||
|
||||
Focus: Systematically comb the doc tree, extract every directive-like statement into a candidate list, resolve granularity (which to merge, split, keep standalone). This is the bulk of the work.
|
||||
|
||||
Each task creates one or more `conductor/directives/<name>/v1.md` files. The v1 content is a verbatim lift from the source doc (not a rewrite). The variant header annotates the source location and why this iteration exists.
|
||||
|
||||
- [ ] **Step 1.1: Harvest §17 banned patterns (7 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/python.md:216-409` (§17 Banned Patterns — the 7 banned patterns + §17.7 boundary exception + §17.8 enforcement + §17.9 local imports + §17.10 enforcement inventory)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
1. `conductor/directives/ban_dict_any/v1.md` — source: `python.md:220-237` (§17.1). Content: the `dict[str, Any]` ban + before/after examples + the boundary exception cross-ref.
|
||||
2. `conductor/directives/ban_any_type/v1.md` — source: `python.md:239-250` (§17.2). Content: the `Any` ban + before/after.
|
||||
3. `conductor/directives/ban_optional_returns/v1.md` — source: `python.md:252-272` (§17.3). Content: the `Optional[T]` return ban + the `Result[T]` replacement pattern.
|
||||
4. `conductor/directives/ban_hasattr_dispatch/v1.md` — source: `python.md:274-299` (§17.4). Content: the `hasattr()` for entity type dispatch ban + the typed Union alternative.
|
||||
5. `conductor/directives/ban_getattr_dispatch/v1.md` — source: `python.md:301-311` (§17.5). Content: the `getattr(x, 'field', default)` for type dispatch ban.
|
||||
6. `conductor/directives/ban_dict_get_on_known_fields/v1.md` — source: `python.md:313-323` (§17.6). Content: the `.get('field', default)` on a `dict[str, Any]` ban + direct attribute access alternative.
|
||||
7. `conductor/directives/boundary_layer_exception/v1.md` — source: `python.md:325-327` (§17.7). Content: the ONE exception — the wire boundary (TOML/JSON parse) where `dict[str, Any]` is allowed.
|
||||
|
||||
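**Variant header format** (use for ALL v1 files, substituting each directive's actual source doc, section, and line range for the `python.md` placeholders):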
|
||||
```markdown
|
||||
# <directive_name> — v1
|
||||
|
||||
**Why this iteration:** Lifted verbatim from `conductor/code_styleguides/python.md` §17.X (lines NNN-MMM).
|
||||
This is the baseline encoding — the style currently in production. Future variants
|
||||
will test alternative encodings (rationale-first, before/after, tabular) against this baseline.
|
||||
|
||||
**Source:** `conductor/code_styleguides/python.md:NNN-MMM`
|
||||
|
||||
---
|
||||
|
||||
<verbatim directive text from the source>
|
||||
```
|
||||
|
||||
- [ ] **Step 1.2: Harvest §17.9 import/aliasing bans (3 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/python.md:336-409` (§17.9 local imports + aliasing + repeated from_dict)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
8. `conductor/directives/ban_local_imports/v1.md` — source: `python.md:336-360` (§17.9a). Content: local imports inside functions are banned + the `try/except ImportError` exception + the vendor-SDK-warmup whitelist.
|
||||
9. `conductor/directives/ban_prefix_aliasing/v1.md` — source: `python.md` (§17.9b, within the 336-409 range). Content: `import X as _X` aliasing-for-naming-convenience is banned.
|
||||
10. `conductor/directives/ban_repeated_from_dict/v1.md` — source: `python.md` (§17.9c, within the 336-409 range). Content: repeated `.from_dict()` calls in the same expression are banned.
|
||||
|
||||
- [ ] **Step 1.3: Harvest error handling conventions (2 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/error_handling.md:22-56` (the 5 patterns) + `error_handling.md:212-242` (hard rules) + `error_handling.md:274-311` (boundary types)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
11. `conductor/directives/result_error_pattern/v1.md` — source: `error_handling.md:22-56, 212-242`. Content: the `Result[T]` dataclass pattern (data + errors list, not `Optional[T]` + exceptions). The 5 patterns (nil-sentinel, zero-init, fail-early, AND over OR, error-info as side-channel). The hard rules (`Optional[T]` returns forbidden in baseline files; `Result[T]` for any function that can fail).
|
||||
12. `conductor/directives/nil_sentinel_pattern/v1.md` — source: `error_handling.md:24-47` (Pattern 1 — Nil-Sentinel Dataclasses). Content: the `NIL_T` singleton pattern replacing `None`. The sentinel type contract.
|
||||
|
||||
- [ ] **Step 1.4: Harvest type/data-structure conventions (2 new directives + 1 update)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/data_oriented_design.md:176-215` (§8.5 Python Type Promotion Mandate + §8.6 Boundary Layer + §8.7 C11 framing)
|
||||
- `conductor/code_styleguides/type_aliases.md:40-81` (Metadata boundary type + when to promote + when NOT to promote)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
13. `conductor/directives/typed_dataclass_fields/v1.md` — source: `data_oriented_design.md:176-199` (§8.5). Content: the Python Type Promotion Mandate — use typed `@dataclass(frozen=True, slots=True)` with explicit fields. The 7 banned patterns table.
|
||||
14. `conductor/directives/metadata_boundary_type/v1.md` — source: `type_aliases.md:40-81` + `data_oriented_design.md:200-215` (§8.6). Content: `Metadata` is the typed fat struct at the wire boundary, NOT `TypeAlias = dict[str, Any]`. The boundary is 2-3 functions per file. When to promote to per-aggregate dataclass vs. when to keep as collapsed codepath.
|
||||
15. `conductor/directives/boundary_layer_exception/v1.md` — UPDATE the file created in Step 1.1 to also include the `data_oriented_design.md:200-215` (§8.6) and `type_aliases.md` boundary-layer content. This directive cross-references §17.7 (the exception) + §8.6 (the boundary definition) + type_aliases.md (the Metadata-as-boundary-type rule).
|
||||
|
||||
- [ ] **Step 1.5: Harvest code style directives (5 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/python.md:7-21` (§1 Indentation + §2 Type Annotations)
|
||||
- `conductor/code_styleguides/python.md:64-71` (§8 AI-Agent Specific Conventions — no comments, no diagnostic noise)
|
||||
- `conductor/code_styleguides/python.md:185-199` (§13 Vertical Compaction)
|
||||
- `conductor/code_styleguides/python.md:175-184` (§12 SDM)
|
||||
- `conductor/workflow.md:5-20` (Code Style section)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
16. `conductor/directives/one_space_indent/v1.md` — source: `python.md:7-20` + `workflow.md:7`. Content: 1-space indentation for ALL Python code. CRLF line endings on Windows. No comments unless explicitly requested.
|
||||
17. `conductor/directives/no_comments_in_body/v1.md` — source: `python.md:66` + `AGENTS.md:56`. Content: no comments in source code; documentation lives in `/docs`. Only comment on *why* when non-obvious.
|
||||
18. `conductor/directives/no_diagnostic_noise/v1.md` — source: `python.md:70` + `AGENTS.md` "No Diagnostic Noise in Production" section. Content: no `sys.stderr.write("[XYZ_DIAG] ...")` in production code. Diag goes to log files or temp scripts.
|
||||
19. `conductor/directives/type_hints_required/v1.md` — source: `python.md:24-31` + `product-guidelines.md:58`. Content: mandatory strict type hints for all parameters, return types, and global variables.
|
||||
20. `conductor/directives/sdm_dependency_tags/v1.md` — source: `python.md:175-184` (§12) + `product-guidelines.md:59`. Content: Structural Dependency Mapping tags (`[C: ...]`, `[M: ...]`, `[U: ...]`) in docstrings for AI-assisted impact analysis.
|
||||
|
||||
- [ ] **Step 1.6: Harvest file/taxonomy conventions (3 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `AGENTS.md:62-76` (File Size and Naming Convention HARD RULE)
|
||||
- `conductor/workflow.md:45` (File Naming Convention HARD RULE)
|
||||
- `conductor/code_styleguides/python.md:205-215` (§15 Modular Controller Pattern)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
21. `conductor/directives/file_naming_convention/v1.md` — source: `AGENTS.md:62-76` + `workflow.md:45`. Content: new `src/<thing>.py` files may only be created on the user's explicit request. Helpers go in the parent module. Large files are FINE.
|
||||
22. `conductor/directives/no_new_src_files_without_permission/v1.md` — source: `AGENTS.md:68-76`. Content: the audit trigger — "is `<thing>` a new system, or is it part of an existing system?" If it's part of an existing system, the file goes in that system's file.
|
||||
23. `conductor/directives/large_files_are_fine/v1.md` — source: `AGENTS.md:62-67`. Content: large files are FINE. The "small files are good" stance is propaganda from LLM training data. Cognitive load is managed via naming, regions, and navigation tools — NOT via file splitting.
|
||||
|
||||
- [ ] **Step 1.7: Harvest process/workflow directives (10 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/workflow.md:80-120` (Standard Task Workflow — TDD, atomic commits, delegate)
|
||||
- `conductor/workflow.md:112-170` (Phase Completion Verification + API Hooks verification)
|
||||
- `conductor/workflow.md:262-280` (Structural Testing Contract)
|
||||
- `AGENTS.md:49-85` (Critical Anti-Patterns)
|
||||
- `AGENTS.md:86-118` (Session-Learned Anti-Patterns)
|
||||
- `AGENTS.md:119-185` (Process Anti-Patterns)
|
||||
- `conductor/workflow.md:385-391` (Tier 2 conventions — the 2 new rules)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
24. `conductor/directives/atomic_per_task_commits/v1.md` — source: `workflow.md:112` + `AGENTS.md:55`. Content: commit per-task for atomic rollback. Do NOT batch commits.
|
||||
25. `conductor/directives/tdd_red_green_required/v1.md` — source: `workflow.md:78-100` (Standard Task Workflow steps 4-6). Content: write failing tests before implementing. Run tests, confirm they fail (Red). Implement, run, confirm pass (Green). The Zero-Assertion Ban (tests must have meaningful assertions).
|
||||
26. `conductor/directives/ban_arbitrary_core_mocking/v1.md` — source: `workflow.md:262`. Content: ban on `unittest.mock.patch` to bypass core infrastructure unless explicitly authorized.
|
||||
27. `conductor/directives/live_gui_poll_not_sleep/v1.md` — source: `workflow.md:465-475` (Anti-Pattern: push_event + time.sleep + assert). Content: replace `time.sleep(N)` with a poll loop on `get_value` or `wait_for_event`.
|
||||
28. `conductor/directives/batch_verification_not_isolation/v1.md` — source: `workflow.md:510-514` (Isolated-Pass Verification Fallacy). Content: the only verification that matters for `live_gui` tests is the batch run. Do NOT commit a fix verified only in isolation.
|
||||
29. `conductor/directives/git_hard_bans/v1.md` — source: `AGENTS.md:59` + `workflow.md:417-430`. Content: `git restore`, `git checkout -- <file>`, `git reset` are FORBIDDEN without explicit user permission. Use `git show` for inspection, not `git checkout`.
|
||||
30. `conductor/directives/ban_day_estimates/v1.md` — source: `AGENTS.md:60`. Content: no day/hour/minute estimates in track artifacts. Measure effort by scope (N files, M sites, N tasks).
|
||||
31. `conductor/directives/no_output_filtering/v1.md` — source: `workflow.md:386`. Content: NEVER filter test output through `Select-Object`, `head`, `tail`. Always redirect to a log file.
|
||||
32. `conductor/directives/prefer_targeted_tier_runs/v1.md` — source: `workflow.md:387`. Content: do NOT run the full 11-tier batch for every verification. Run targeted tiers.
|
||||
33. `conductor/directives/mandatory_research_first/v1.md` — source: `workflow.md:46`. Content: before reading any file >50 lines, use `get_file_summary`/`py_get_skeleton`/`py_get_code_outline` to map the structure first.
|
||||
|
||||
- [ ] **Step 1.8: Harvest process anti-patterns (6 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `AGENTS.md:119-185` (Process Anti-Patterns — the 8 named patterns)
|
||||
- `conductor/workflow.md` "Skip-Marker Policy" section
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
34. `conductor/directives/no_skip_markers_as_avoidance/v1.md` — source: `workflow.md` "Skip-Marker Policy" + `AGENTS.md:54`. Content: `@pytest.mark.skip` is documentation of a known failure, not an escape from fixing the bug. Fix in-session when feasible.
|
||||
35. `conductor/directives/deduction_loop_limit/v1.md` — source: `AGENTS.md:127` (Process Anti-Pattern #1). Content: at most 2 test runs in a single investigation. After the 2nd failure, STOP and read the code.
|
||||
36. `conductor/directives/report_instead_of_fix_ban/v1.md` — source: `AGENTS.md:134` (Process Anti-Pattern #2). Content: a 200-line status report is a confession, not a fix. A good status report is 5-10 sentences.
|
||||
37. `conductor/directives/scope_creep_track_doc_ban/v1.md` — source: `AGENTS.md:143` (Process Anti-Pattern #3). Content: if the user asks for a fix, your output is the fix. A track doc is only for multi-day work.
|
||||
38. `conductor/directives/inherited_cruft_ask_first/v1.md` — source: `AGENTS.md:149` (Process Anti-Pattern #4). Content: if a file is broken from a previous session, ASK the user before trying to fix it.
|
||||
39. `conductor/directives/verbose_commit_message_ban/v1.md` — source: `AGENTS.md:176` (Process Anti-Pattern #7). Content: a commit message is 1-3 sentences. If it's longer than 15 lines, it's a report.
|
||||
|
||||
- [ ] **Step 1.9: Harvest GUI/architecture directives (5 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/product-guidelines.md:29-43` (UX & UI Principles + Code Standards)
|
||||
- `conductor/workflow.md:39` (ImGui Verification)
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
40. `conductor/directives/imgui_scope_verification/v1.md` — source: `product-guidelines.md:39` + `workflow.md:39`. Content: all changes to `gui_2.py` MUST be verified using `scripts/check_imgui_scopes.py`. Use `imscope` context managers over manual push/pop.
|
||||
41. `conductor/directives/modular_controller_pattern/v1.md` — source: `product-guidelines.md:40`. Content: state-independent logic must be moved to module-level functions. Massive `if/elif` dispatch blocks must be refactored into handler maps.
|
||||
42. `conductor/directives/ui_delegation_for_hot_reload/v1.md` — source: `product-guidelines.md:41`. Content: all complex ImGui rendering logic must be extracted from the `App` class into module-level `render_xxx(app)` functions. The `App` class should only contain thin delegation wrappers.
|
||||
43. `conductor/directives/strict_state_management/v1.md` — source: `product-guidelines.md:37`. Content: rigorous separation between the Main GUI rendering thread and daemon execution threads. The UI should NEVER hang during AI communication. Use lock-protected queues and events.
|
||||
44. `conductor/directives/comprehensive_logging/v1.md` — source: `product-guidelines.md:38`. Content: aggressively log all actions, API payloads, tool calls, and executed scripts. Maintain timestamped JSON-L and markdown logs.
|
||||
|
||||
- [ ] **Step 1.10: Harvest feature-flag + RAG + cache + knowledge directives (4 directives)**
|
||||
|
||||
**Files to read:**
|
||||
- `conductor/code_styleguides/feature_flags.md`
|
||||
- `conductor/code_styleguides/rag_integration_discipline.md:11-20` (the 6 rules)
|
||||
- `conductor/code_styleguides/cache_friendly_context.md:52-74` (the byte-comparison test)
|
||||
- `conductor/code_styleguides/knowledge_artifacts.md`
|
||||
|
||||
**Directives to create:**
|
||||
|
||||
45. `conductor/directives/feature_flag_delete_to_turn_off/v1.md` — source: `feature_flags.md`. Content: file presence ("delete to turn off") for side artifacts; config flags for persistent preferences; CLI flags for one-shot overrides.
|
||||
46. `conductor/directives/rag_six_rules/v1.md` — source: `rag_integration_discipline.md:11-20`. Content: the 6 rules (opt-in, complements, provenance, no mutation, feature-gated, graceful failure).
|
||||
47. `conductor/directives/cache_stable_to_volatile/v1.md` — source: `cache_friendly_context.md:52-74`. Content: stable-to-volatile context ordering. The byte-comparison test. Layers 1-7 cacheable, 8-12 not.
|
||||
48. `conductor/directives/knowledge_harvest_pattern/v1.md` — source: `knowledge_artifacts.md`. Content: the category files + provenance + sha256 ledger + digest regeneration pattern.
|
||||
|
||||
- [ ] **Step 1.11: Commit the directive harvest**
|
||||
|
||||
```bash
|
||||
git add conductor/directives/
|
||||
git commit -m "feat(directives): harvest 48 directives from doc tree into conductor/directives/
|
||||
|
||||
Systematic extraction of every directive-like statement (imperative,
|
||||
preference, hard ban, convention, anti-pattern) from the entire doc tree
|
||||
into conductor/directives/<name>/v1.md files. Each v1 is a verbatim lift
|
||||
from the source doc with a header annotating the source location.
|
||||
|
||||
Sources combed: AGENTS.md, conductor/workflow.md, conductor/product-guidelines.md,
|
||||
conductor/tech-stack.md, all 10 conductor/code_styleguides/*.md, docs/AGENTS.md.
|
||||
|
||||
Original docs remain untouched as canonical source. The conductor/directives/
|
||||
tree is a parallel structure, not a replacement."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Baseline Preset + Role-Prompt Bootstrap
|
||||
|
||||
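Focus: Create the `current_baseline.md` preset that lists every harvested directive (47 variant files — the 48 numbered harvest items include one update to an existing file, `boundary_layer_exception`), then update the 5 role prompts with the `warm with:` bootstrap.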
|
||||
|
||||
- [ ] **Step 2.1: Create the baseline preset**
|
||||
|
||||
**File:** `conductor/directives/presets/current_baseline.md`
|
||||
|
||||
**Content:**
|
||||
|
||||
```markdown
|
||||
# Preset: current_baseline
|
||||
|
||||
The baseline directive composition — all v1 variants lifted verbatim from the
|
||||
current production docs. This is the starting point; alternative presets swap
|
||||
variants to test different encodings.
|
||||
|
||||
## Directives to warm
|
||||
|
||||
Read each file below before any action.
|
||||
|
||||
- ban_dict_any: conductor/directives/ban_dict_any/v1.md
|
||||
- ban_any_type: conductor/directives/ban_any_type/v1.md
|
||||
- ban_optional_returns: conductor/directives/ban_optional_returns/v1.md
|
||||
- ban_hasattr_dispatch: conductor/directives/ban_hasattr_dispatch/v1.md
|
||||
- ban_getattr_dispatch: conductor/directives/ban_getattr_dispatch/v1.md
|
||||
- ban_dict_get_on_known_fields: conductor/directives/ban_dict_get_on_known_fields/v1.md
|
||||
- boundary_layer_exception: conductor/directives/boundary_layer_exception/v1.md
|
||||
- ban_local_imports: conductor/directives/ban_local_imports/v1.md
|
||||
- ban_prefix_aliasing: conductor/directives/ban_prefix_aliasing/v1.md
|
||||
- ban_repeated_from_dict: conductor/directives/ban_repeated_from_dict/v1.md
|
||||
- result_error_pattern: conductor/directives/result_error_pattern/v1.md
|
||||
- nil_sentinel_pattern: conductor/directives/nil_sentinel_pattern/v1.md
|
||||
- typed_dataclass_fields: conductor/directives/typed_dataclass_fields/v1.md
|
||||
- metadata_boundary_type: conductor/directives/metadata_boundary_type/v1.md
|
||||
- one_space_indent: conductor/directives/one_space_indent/v1.md
|
||||
- no_comments_in_body: conductor/directives/no_comments_in_body/v1.md
|
||||
- no_diagnostic_noise: conductor/directives/no_diagnostic_noise/v1.md
|
||||
- type_hints_required: conductor/directives/type_hints_required/v1.md
|
||||
- sdm_dependency_tags: conductor/directives/sdm_dependency_tags/v1.md
|
||||
- file_naming_convention: conductor/directives/file_naming_convention/v1.md
|
||||
- no_new_src_files_without_permission: conductor/directives/no_new_src_files_without_permission/v1.md
|
||||
- large_files_are_fine: conductor/directives/large_files_are_fine/v1.md
|
||||
- atomic_per_task_commits: conductor/directives/atomic_per_task_commits/v1.md
|
||||
- tdd_red_green_required: conductor/directives/tdd_red_green_required/v1.md
|
||||
- ban_arbitrary_core_mocking: conductor/directives/ban_arbitrary_core_mocking/v1.md
|
||||
- live_gui_poll_not_sleep: conductor/directives/live_gui_poll_not_sleep/v1.md
|
||||
- batch_verification_not_isolation: conductor/directives/batch_verification_not_isolation/v1.md
|
||||
- git_hard_bans: conductor/directives/git_hard_bans/v1.md
|
||||
- ban_day_estimates: conductor/directives/ban_day_estimates/v1.md
|
||||
- no_output_filtering: conductor/directives/no_output_filtering/v1.md
|
||||
- prefer_targeted_tier_runs: conductor/directives/prefer_targeted_tier_runs/v1.md
|
||||
- mandatory_research_first: conductor/directives/mandatory_research_first/v1.md
|
||||
- no_skip_markers_as_avoidance: conductor/directives/no_skip_markers_as_avoidance/v1.md
|
||||
- deduction_loop_limit: conductor/directives/deduction_loop_limit/v1.md
|
||||
- report_instead_of_fix_ban: conductor/directives/report_instead_of_fix_ban/v1.md
|
||||
- scope_creep_track_doc_ban: conductor/directives/scope_creep_track_doc_ban/v1.md
|
||||
- inherited_cruft_ask_first: conductor/directives/inherited_cruft_ask_first/v1.md
|
||||
- verbose_commit_message_ban: conductor/directives/verbose_commit_message_ban/v1.md
|
||||
- imgui_scope_verification: conductor/directives/imgui_scope_verification/v1.md
|
||||
- modular_controller_pattern: conductor/directives/modular_controller_pattern/v1.md
|
||||
- ui_delegation_for_hot_reload: conductor/directives/ui_delegation_for_hot_reload/v1.md
|
||||
- strict_state_management: conductor/directives/strict_state_management/v1.md
|
||||
- comprehensive_logging: conductor/directives/comprehensive_logging/v1.md
|
||||
- feature_flag_delete_to_turn_off: conductor/directives/feature_flag_delete_to_turn_off/v1.md
|
||||
- rag_six_rules: conductor/directives/rag_six_rules/v1.md
|
||||
- cache_stable_to_volatile: conductor/directives/cache_stable_to_volatile/v1.md
|
||||
- knowledge_harvest_pattern: conductor/directives/knowledge_harvest_pattern/v1.md
|
||||
|
||||
## Notes
|
||||
|
||||
All v1 (verbatim lifts from current production docs). No alternative encodings
|
||||
tested yet. This preset is the control group for future experiments.
|
||||
|
||||
To create an experimental preset: copy this file, change the variant path for
|
||||
the directives you want to test (e.g., swap `v1.md` for `v2_rationale_first.md`),
|
||||
and update the Notes section with your hypothesis.
|
||||
```
|
||||
|
||||
- [ ] **Step 2.2: Commit the preset**
|
||||
|
||||
```bash
|
||||
git add conductor/directives/presets/current_baseline.md
|
||||
git commit -m "feat(directives): add current_baseline preset (48 directives, all v1)"
|
||||
```
|
||||
|
||||
- [ ] **Step 2.3: Update tier1-orchestrator.md with warm with: bootstrap**
|
||||
|
||||
**File:** `.opencode/agents/tier1-orchestrator.md`
|
||||
|
||||
**What to change:** Find the "MANDATORY: Pre-Action Required Reading" section (or equivalent hardcoded file list). Replace the directive-reading portion with:
|
||||
|
||||
```markdown
|
||||
## MANDATORY: Directive Warm-up
|
||||
|
||||
warm with: conductor/directives/presets/current_baseline.md
|
||||
|
||||
Read the preset file above. It lists directive variant files to read before any action.
|
||||
Read each file the preset references. These are your active directives for this session.
|
||||
|
||||
If the user specifies a different preset (e.g., "warm with: conductor/directives/presets/exploratory_rationale.md"),
|
||||
use that instead. The user's instruction overrides the default.
|
||||
```
|
||||
|
||||
**What stays (non-directive reads that remain hardcoded):**
|
||||
- `AGENTS.md` — project operating rules
|
||||
- `conductor/workflow.md` — operational workflow
|
||||
- `conductor/edit_workflow.md` — edit tool contract
|
||||
- The relevant `docs/guide_*.md` — architecture reference
|
||||
|
||||
- [ ] **Step 2.4: Update tier2-tech-lead.md with warm with: bootstrap**
|
||||
|
||||
**File:** `.opencode/agents/tier2-tech-lead.md`
|
||||
|
||||
Same change as Step 2.3. The non-directive reads that stay hardcoded:
|
||||
- `AGENTS.md`
|
||||
- `conductor/workflow.md`
|
||||
- `conductor/edit_workflow.md`
|
||||
- `conductor/tier2/githooks/forbidden-files.txt`
|
||||
- The relevant `docs/guide_*.md`
|
||||
|
||||
- [ ] **Step 2.5: Update tier3-worker.md with warm with: bootstrap**
|
||||
|
||||
**File:** `.opencode/agents/tier3-worker.md`
|
||||
|
||||
Same change. Note: Tier 3 may benefit from a reduced preset (fewer directives — they don't need the planning/strategy directives). But for now, use `current_baseline.md` and let the user create a `worker_minimal.md` preset later.
|
||||
|
||||
- [ ] **Step 2.6: Update tier4-qa.md with warm with: bootstrap**
|
||||
|
||||
**File:** `.opencode/agents/tier4-qa.md`
|
||||
|
||||
Same change. Tier 4 reads narrowly; the preset can be customized later.
|
||||
|
||||
- [ ] **Step 2.7: Update tier2-autonomous.md with warm with: bootstrap**
|
||||
|
||||
**File:** `conductor/tier2/agents/tier2-autonomous.md`
|
||||
|
||||
This file has the most extensive hardcoded reading list (11 files, lines 32-52). Replace the directive-reading portion with the `warm with:` bootstrap. The non-directive reads that stay:
|
||||
- `AGENTS.md`
|
||||
- `conductor/workflow.md`
|
||||
- `conductor/edit_workflow.md`
|
||||
- `conductor/tier2/githooks/forbidden-files.txt`
|
||||
- `conductor/tracks/tier2_leak_prevention_20260620/spec.md` (this is a track spec, not a directive — stays hardcoded)
|
||||
|
||||
- [ ] **Step 2.8: Commit the role-prompt updates**
|
||||
|
||||
```bash
|
||||
git add .opencode/agents/tier1-orchestrator.md .opencode/agents/tier2-tech-lead.md .opencode/agents/tier3-worker.md .opencode/agents/tier4-qa.md conductor/tier2/agents/tier2-autonomous.md
|
||||
git commit -m "feat(role-prompts): replace hardcoded directive lists with warm with: bootstrap
|
||||
|
||||
All 5 tier role prompts now use 'warm with: conductor/directives/presets/current_baseline.md'
|
||||
instead of a hardcoded list of ~11 files. The LLM reads the preset, then reads
|
||||
the variant files it lists. Non-directive reads (AGENTS.md, workflow.md,
|
||||
edit_workflow.md, forbidden-files.txt, guide_*.md) remain hardcoded.
|
||||
|
||||
The user can override the preset per-session by saying 'warm with: <path>' in
|
||||
their session message. This is the hot-swap mechanism."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Verification + End-of-Track
|
||||
|
||||
- [ ] **Step 3.1: Verify the directory structure**
|
||||
|
||||
```bash
|
||||
# Count entries under conductor/directives/ (one per directive directory, plus presets/)
|
||||
ls conductor/directives/ | wc -l
|
||||
|
||||
# Count v1.md files
|
||||
find conductor/directives/ -name "v1.md" | wc -l
|
||||
|
||||
# Verify preset exists
|
||||
test -f conductor/directives/presets/current_baseline.md
|
||||
|
||||
# Verify all 5 role prompts have the warm with: line
|
||||
grep -l "warm with:" .opencode/agents/tier1-orchestrator.md .opencode/agents/tier2-tech-lead.md .opencode/agents/tier3-worker.md .opencode/agents/tier4-qa.md conductor/tier2/agents/tier2-autonomous.md
|
||||
```
|
||||
|
||||
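Expected: 48 entries from `ls` (47 directive directories plus `presets/`), 47 `v1.md` files (harvest item 15 updates the existing `boundary_layer_exception/v1.md` rather than creating a new file), preset exists, and `grep -l` prints all 5 role-prompt paths.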
|
||||
|
||||
- [ ] **Step 3.2: Manual verification — does the LLM follow the warm with: instruction?**
|
||||
|
||||
Start a new OpenCode session with any tier role. Observe whether the LLM:
|
||||
1. Reads the preset file at `conductor/directives/presets/current_baseline.md`
|
||||
2. Reads each variant file listed in the preset
|
||||
3. Has the directives in context for the session
|
||||
|
||||
This is the "test" — there's no automated test for this. The signal is: does the LLM behave as if it has read the directives?
|
||||
|
||||
- [ ] **Step 3.3: Write end-of-track report**
|
||||
|
||||
**File:** `docs/reports/TRACK_COMPLETION_directive_hotswap_harness_20260627.md`
|
||||
|
||||
Document:
|
||||
- What shipped (47 directive files from 48 numbered harvest items + baseline preset + 5 role-prompt updates)
|
||||
- The directory structure
|
||||
- The preset format
|
||||
- The `warm with:` bootstrap
|
||||
- How to hot-swap (create a new preset or tell the LLM "warm with: <path>")
|
||||
- What's NOT included (no scripts, no TOML, no v2+ variants yet)
|
||||
- Handoff to future tracks (alternative encoding authoring, Manual Slop integration, token-cost analysis)
|
||||
|
||||
- [ ] **Step 3.4: Commit the end-of-track report**
|
||||
|
||||
```bash
|
||||
git add docs/reports/TRACK_COMPLETION_directive_hotswap_harness_20260627.md
|
||||
git commit -m "docs(reports): TRACK_COMPLETION_directive_hotswap_harness_20260627"
|
||||
```
|
||||
@@ -0,0 +1,230 @@
|
||||
# Design: Directive Hot-Swap Harness (OpenCode Directive Presets)
|
||||
|
||||
**Date:** 2026-06-27
|
||||
**Status:** Draft — pending user review
|
||||
**Track ID (proposed):** `directive_hotswap_harness_20260627`
|
||||
|
||||
## Problem
|
||||
|
||||
The codebase's directives — the instructions that tell LLMs how to behave (banned patterns, conventions, hard bans, anti-patterns) — are scattered across the entire doc tree: `AGENTS.md`, `conductor/workflow.md`, `conductor/product-guidelines.md`, `conductor/tech-stack.md`, every `conductor/code_styleguides/*.md`, `docs/Readme.md`, `docs/AGENTS.md`, all 14 `docs/guide_*.md`, etc. They're embedded in prose, tables, anti-pattern sections, "Critical Anti-Patterns" lists, "Hard Rules," styleguide sections.
|
||||
|
||||
The 4 tier role prompts (`.opencode/agents/tier1-orchestrator.md`, `tier2-tech-lead.md`, `tier3-worker.md`, `tier4-qa.md`) plus the autonomous variant (`conductor/tier2/agents/tier2-autonomous.md`) currently hardcode a list of ~11 files to read before any action. This list is static — every session gets the same directives regardless of the task. There's no mechanism to:
|
||||
- Test whether an alternative encoding of the same directive (imperative-ban vs. rationale-first vs. before/after) produces better LLM compliance
|
||||
- Hot-swap which encoding is active without manually editing files or navigating the filesystem
|
||||
- Exercise per-session control over which directives the LLM warms up with
|
||||
|
||||
## Goal
|
||||
|
||||
Build a **directive hot-swap harness** that lets the user:
|
||||
1. Maintain multiple alternative encodings ("variants") of the same directive as separate files
|
||||
2. Compose active directive sets into named "presets" (markdown bills of materials)
|
||||
3. Hot-swap which preset is active via a single `warm with: <path>` instruction in the role prompt or session message
|
||||
4. Use the existing file-reading behavior LLMs already have — no scripts, no TOML, no build steps
|
||||
|
||||
## Design
|
||||
|
||||
### The directive directory structure
|
||||
|
||||
```
|
||||
conductor/directives/
|
||||
<directive_name>/
|
||||
v1.md ← the baseline encoding (verbatim lift from current docs)
|
||||
v2_<style>.md ← alternative encodings (added over time)
|
||||
presets/
|
||||
current_baseline.md ← the default preset (all v1)
|
||||
<experimental>.md ← alternative presets (added over time)
|
||||
```
|
||||
|
||||
**Naming convention:** lowercase, underscore-separated, action-oriented (`ban_dict_any`, not `dict_str_any_ban`). The name describes the directive's intent.
|
||||
|
||||
**Variant file format:** each variant file (`v1.md`, `v2_<style>.md`, …) starts with a short header explaining why this iteration exists, followed by the directive text:
|
||||
|
||||
```markdown
|
||||
# <directive_name> — v1
|
||||
|
||||
**Why this iteration:** Lifted verbatim from `conductor/code_styleguides/python.md` §17.1.
|
||||
This is the baseline encoding — the imperative-ban style currently in production.
|
||||
Future variants will test alternative encodings against this baseline.
|
||||
|
||||
---
|
||||
|
||||
<directive text>
|
||||
```
|
||||
|
||||
### The preset format
|
||||
|
||||
A preset is a markdown bill of materials. It tells the LLM which directive variant files to read for this run. Nothing more.
|
||||
|
||||
```markdown
|
||||
# Preset: current_baseline
|
||||
|
||||
The baseline directive composition — all v1 variants lifted from the current
|
||||
production docs.
|
||||
|
||||
## Directives to warm
|
||||
|
||||
Read each file below before any action.
|
||||
|
||||
- ban_dict_any: conductor/directives/ban_dict_any/v1.md
|
||||
- ban_optional_returns: conductor/directives/ban_optional_returns/v1.md
|
||||
- no_local_imports: conductor/directives/no_local_imports/v1.md
|
||||
- ...
|
||||
|
||||
## Notes
|
||||
|
||||
All v1 (verbatim lifts from current production docs). No alternative encodings
|
||||
tested yet. This preset is the control group for future experiments.
|
||||
```
|
||||
|
||||
**Key properties:**
|
||||
- **Flat list.** No nesting, no conditionals, no includes. The LLM reads the list, reads the files.
|
||||
- **Human-readable name.** `current_baseline`, `exploratory_rationale`, `minimal_tokens` — pick by name.
|
||||
- **Notes section.** Documents the hypothesis being tested. This is the experiment log, inline with the preset.
|
||||
- **Partial swaps.** Swap 2-3 directives to v2, leave the rest at v1. The preset makes the diff explicit.
|
||||
- **No script needed.** Author a new preset by copying an existing one and changing variant paths. Hot-swap by telling the LLM which preset to use.
|
||||
|
||||
### The role-prompt bootstrap
|
||||
|
||||
The 5 role prompts (`.opencode/agents/tier1-orchestrator.md`, `tier2-tech-lead.md`, `tier3-worker.md`, `tier4-qa.md`, and `conductor/tier2/agents/tier2-autonomous.md`) have a hardcoded "MANDATORY: Pre-Action Required Reading" section listing ~11 specific files. The directive-bearing entries in that list are replaced with a single `warm with:` line; non-directive context files remain as direct references in the role prompt.
|
||||
|
||||
```markdown
|
||||
## MANDATORY: Directive Warm-up
|
||||
|
||||
warm with: conductor/directives/presets/current_baseline.md
|
||||
|
||||
Read the preset file above. It lists directive variant files to read before any action.
|
||||
Read each file the preset references. These are your active directives for this session.
|
||||
|
||||
If the user specifies a different preset (e.g., "warm with: conductor/directives/presets/exploratory_rationale.md"),
|
||||
use that instead. The user's instruction overrides the default.
|
||||
```
|
||||
|
||||
**Key properties:**
|
||||
- **One line is the bootstrap.** `warm with: <path>` is the entire mechanism.
|
||||
- **User override.** The user can tell the LLM "warm with: <path>" in their session message and it uses that preset instead of the default. This is the hot-swap — no file editing, just a text instruction.
|
||||
- **Per-role defaults.** Each tier role prompt can default to a different preset.
|
||||
- **Non-directive reads remain hardcoded.** Files that aren't tunable directives (e.g., `conductor/tracks/tier2_leak_prevention_20260620/spec.md`, `conductor/tier2/githooks/forbidden-files.txt`) stay as direct references in the role prompt.
|
||||
|
||||
### What stays in the role prompt (not directive-based)
|
||||
|
||||
- `AGENTS.md` — project operating rules (contains directives AND non-directive rules)
|
||||
- `conductor/workflow.md` — operational workflow
|
||||
- `conductor/edit_workflow.md` — edit tool contract
|
||||
- `conductor/tier2/githooks/forbidden-files.txt` — file denylist
|
||||
- The relevant `docs/guide_*.md` — architecture reference
|
||||
|
||||
These are context, not tunable directives. They stay hardcoded in the role prompt.
|
||||
|
||||
### The directive harvest
|
||||
|
||||
The directives are NOT limited to the 11 files the role prompts mandate. They're scattered across the entire doc tree. The track's first phase is a systematic harvest:
|
||||
|
||||
**A directive is any statement that tells the LLM:**
|
||||
- "Do X" / "Don't do X" (imperative)
|
||||
- "Use Y instead of Z" (preference)
|
||||
- "This is BANNED" (hard ban)
|
||||
- "Follow pattern P" (convention)
|
||||
- "Never do Q" (anti-pattern)
|
||||
|
||||
**NOT a directive:**
|
||||
- Descriptive prose ("The App class holds GUI state")
|
||||
- Architecture documentation ("Thread domains are separated by...")
|
||||
- Reference material ("The 45-tool inventory includes...")
|
||||
|
||||
**Sources to comb (non-exhaustive):**
|
||||
- `AGENTS.md` — "Critical Anti-Patterns", "File Size and Naming Convention", "Session-Learned Anti-Patterns", "Process Anti-Patterns"
|
||||
- `conductor/workflow.md` — "Code Style", "Guiding Principles", "Testing Requirements", "Known Pitfalls", "Process Anti-Patterns", "Tier 2 Autonomous Sandbox conventions"
|
||||
- `conductor/product-guidelines.md` — "Core Value", "Code Standards & Architecture", "Data-Oriented Error Handling", "Phase 5: Heavy Curation"
|
||||
- `conductor/tech-stack.md` — "Core Value" header
|
||||
- `conductor/code_styleguides/data_oriented_design.md` — §8.5 "Python Type Promotion Mandate", the 7-question simplification pass, the 10-question self-check
|
||||
- `conductor/code_styleguides/python.md` — §10 "Anti-OOP Conventions", §17 "LLM Default Anti-Patterns" (the 7 banned patterns)
|
||||
- `conductor/code_styleguides/error_handling.md` — the Result[T] convention, the AI Agent Checklist
|
||||
- `conductor/code_styleguides/type_aliases.md` — "When NOT to promote"
|
||||
- `conductor/code_styleguides/feature_flags.md` — "delete to turn off" convention
|
||||
- `conductor/code_styleguides/agent_memory_dimensions.md` — the 4-dimension decision tree
|
||||
- `conductor/code_styleguides/rag_integration_discipline.md` — "conservative-RAG rule"
|
||||
- `conductor/code_styleguides/cache_friendly_context.md` — stable-to-volatile ordering
|
||||
- `conductor/code_styleguides/knowledge_artifacts.md` — the harvest pattern
|
||||
- `docs/AGENTS.md` — "Convention Enforcement"
|
||||
- `docs/Readme.md` — any directive-like content in feature descriptions
|
||||
|
||||
**Granularity resolution:** the harvest produces a candidate list. Then the question of which directives to merge (e.g., `ban_prefix_aliasing` + `no_local_imports` might become `import_hygiene`), split, or keep standalone is resolved in the harvest phase — not locked in upfront.
|
||||
|
||||
### The original docs stay untouched
|
||||
|
||||
The `conductor/directives/` tree is a *parallel* structure, not a replacement. The original docs (`python.md`, `error_handling.md`, `AGENTS.md`, etc.) remain the canonical source until a future track deprecates them. The harness is useful immediately (the v1 variants are exact copies); the old docs are not broken.
|
||||
|
||||
### Why no scripts / TOML
|
||||
|
||||
The user explicitly rejected TOML manifests and scripts for this initial version: "no need to systematize that hard when I don't know what's going to work yet." The preset is markdown. The hot-swap is a text instruction. The variant selection is a path in a markdown file. No build steps, no generated files, no tooling dependencies. If the system proves useful, a future track can add automation (auto-generating presets from the directory tree, token-cost analysis per variant, automated compliance testing).
|
||||
|
||||
## Scope: Two Parallel Campaigns
|
||||
|
||||
The user's request bundles two distinct campaigns that share a theme ("how do you encode information densely for an LLM?") but are tracked and executed independently.
|
||||
|
||||
### Campaign A: Directive Hot-Swap Harness (this spec)
|
||||
|
||||
**Track A-1 (this):** directive harvest + scaffold + baseline preset + role-prompt bootstrap update. Gets the system working with v1 (current) encodings.
|
||||
|
||||
Future tracks in Campaign A:
|
||||
- Alternative encoding authoring (v2, v3 per directive — the actual experimentation)
|
||||
- Manual Slop integration (a "Directive Lab" panel for virtualized directive selection)
|
||||
- Token-cost analysis tooling
|
||||
- Automated compliance testing
|
||||
|
||||
### Campaign B: Video Analysis (4 new videos)
|
||||
|
||||
A separate research campaign following the established 3-pass pattern from the previous 12-video campaign (Pass 1: extract → Pass 2: deobfuscate → Pass 3: project to C11/Python). The 4 videos:
|
||||
|
||||
1. **Reinventing Entropy | Compression is Intelligence Part 1** (https://youtu.be/l6DKRf-fAAM)
|
||||
2. **Yann LeCun: World Models: Enabling the next AI revolution** (https://www.youtube.com/watch?v=72Xj8k5WQX4)
|
||||
3. **Yann LeCun's $1B Bet Against LLMs [Part 1]** (https://youtu.be/kYkIdXwW2AE)
|
||||
4. **Recursive Self-Improvement** (https://youtu.be/t7_ZXgfJVG8)
|
||||
|
||||
### Cross-Campaign Relationship
|
||||
|
||||
The two campaigns inform each other but have no hard dependency:
|
||||
|
||||
- **The video analysis informs directive encoding.** The entropy/compression video (video 1) provides theoretical grounding for how information density affects comprehension. LeCun's world-model work (videos 2-3) informs how LLMs model directive intent. Recursive self-improvement (video 4) is directly relevant to the meta-question of whether better directive encodings can be discovered iteratively. Insights from the video analysis may surface alternative encoding strategies to test in Campaign A's harness.
|
||||
|
||||
- **The harness informs the video analysis.** The previous video campaign produced a lexicon + C11 reference + deobfuscation DSL. The directive harness is itself a compression-aid tool — it encodes the same directive in fewer/different tokens and observes the effect. The harness's design (preset as bill-of-materials, variant as alternative encoding) is the same pattern as the video campaign's deobfuscation pass (same content, different encoding). The harness may inform how the video analysis encodes its own outputs.
|
||||
|
||||
- **Execution order:** the campaigns can run in parallel. Campaign A (Track A-1) is an engineering track; Campaign B is a research track. They don't share files. The cross-pollination is intellectual, not structural.
|
||||
|
||||
### The video analysis track structure (Campaign B)
|
||||
|
||||
Follows the established 3-pass pattern from `docs/reports/2026-06-15/CAMPAIGN_CLOSE_OUT_video_analysis_20260621.md`:
|
||||
|
||||
- **Pass 1:** Information extraction (4 deep-dive reports, one per video). Uses the existing `scripts/video_analysis/` pipeline (download_video, extract_transcript, extract_keyframes, ocr_frames, synthesize_report). The lexicon v2 from the previous campaign is the starting point for deobfuscation.
|
||||
- **Pass 2:** Deobfuscation (apply the lexicon v2 to the 4 new videos' content). May produce lexicon v3 corrections if the new videos surface notation the lexicon doesn't cover.
|
||||
- **Pass 3:** C11/Python projection (project each video's deobfuscated content to code in the user's idiomatic style).
|
||||
|
||||
The video analysis track is initialized as a separate conductor track (`video_analysis_campaign_2_20260627` or similar). Its spec/plan is authored separately from this design doc.
|
||||
|
||||
## Out of Scope (for Track A-1)
|
||||
|
||||
- **Authoring alternative encodings (v2+).** This track only creates v1 (verbatim lifts). The experimentation is a future activity.
|
||||
- **Deprecating the original docs.** The old docs stay as canonical source.
|
||||
- **Scripts for preset generation or variant selection.** No automation in this version.
|
||||
- **Manual Slop GUI integration.** The harness is OpenCode-only for now.
|
||||
- **Token-cost analysis.** No tooling to measure token cost per variant in this version.
|
||||
- **Automated compliance testing.** No test harness to measure LLM compliance per encoding.
|
||||
- **The 4-video analysis (Campaign B).** Separate track, separate campaign. This design doc covers Campaign A (the harness) only. The video analysis gets its own track spec.
|
||||
|
||||
## Risks
|
||||
|
||||
1. **Harvest completeness.** The directive harvest might miss directives embedded in prose. Mitigation: systematic combing of the doc tree + the user reviews the candidate list before variants are created.
|
||||
2. **Granularity ambiguity.** Some directives overlap (e.g., "ban dict[str, Any]" and "use typed dataclass fields" are two sides of the same coin). Mitigation: the harvest phase produces a candidate list; the granularity is resolved there, not upfront.
|
||||
3. **Role-prompt drift.** The 5 role prompts need to be updated consistently. Mitigation: the `warm with:` line is the only change; the rest of each role prompt is untouched.
|
||||
4. **Adoption friction.** LLMs might not follow the `warm with:` instruction reliably. Mitigation: the instruction is simple (read a file, read the files it lists) and uses the existing file-reading behavior the LLMs already have.
|
||||
|
||||
## See Also
|
||||
|
||||
- `conductor/tier2/agents/tier2-autonomous.md` — the role prompt that will be updated with `warm with:`
|
||||
- `conductor/tier2/commands/tier-2-auto-execute.md` — the slash command template
|
||||
- `conductor/code_styleguides/python.md` §17 — the primary source of directives to harvest
|
||||
- `conductor/code_styleguides/error_handling.md` — the Result[T] convention to harvest
|
||||
- `AGENTS.md` "Critical Anti-Patterns" — the hard bans to harvest
|
||||
- `docs/guide_meta_boundary.md` — the meta-tooling / application distinction (relevant to why this harness lives in the meta-tooling domain)
|
||||
- `docs/reports/2026-06-15/CAMPAIGN_CLOSE_OUT_video_analysis_20260621.md` — the previous video campaign's closeout (the pattern Campaign B follows)
|
||||
- `scripts/video_analysis/` — the existing video analysis pipeline (Campaign B reuses this)
|
||||
@@ -0,0 +1,68 @@
|
||||
# Track state for directive_hotswap_harness_20260627
|
||||
# Initialized by Tier 1 Orchestrator on 2026-06-27.
|
||||
# Implementation delegated to Tier 2 (autonomous) or Tier 3 worker dispatch.
|
||||
# This is Track 1 of Campaign A (Directive Encoding Campaign).
|
||||
|
||||
[meta]
|
||||
track_id = "directive_hotswap_harness_20260627"
|
||||
name = "Directive Hot-Swap Harness (OpenCode Directive Presets)"
|
||||
status = "active"
|
||||
current_phase = 0
|
||||
last_updated = "2026-06-27"
|
||||
|
||||
[blocked_by]
|
||||
# None. Pure documentation/track-artifact work; no code changes, no tests,
|
||||
# zero overlap with any running track.
|
||||
|
||||
[blocks]
|
||||
directive_encoding_experiments = "planned (future; v2+ variant authoring)"
|
||||
manual_slop_directive_lab = "planned (future; GUI integration)"
|
||||
|
||||
[phases]
|
||||
phase_1 = { status = "pending", checkpointsha = "", name = "Directive Harvest (11 steps: 48 directives from doc tree into conductor/directives/)" }
|
||||
phase_2 = { status = "pending", checkpointsha = "", name = "Baseline Preset + Role-Prompt Bootstrap (8 steps: preset + 5 role-prompt warm with: updates)" }
|
||||
phase_3 = { status = "pending", checkpointsha = "", name = "Verification + End-of-Track (4 steps: dir structure verify + manual LLM verify + report + commit)" }
|
||||
|
||||
[tasks]
|
||||
# Phase 1: directive harvest
|
||||
t1_1 = { status = "pending", commit_sha = "", description = "Harvest 17.1-17.7 banned patterns (7 directives: ban_dict_any, ban_any_type, ban_optional_returns, ban_hasattr_dispatch, ban_getattr_dispatch, ban_dict_get_on_known_fields, boundary_layer_exception)" }
|
||||
t1_2 = { status = "pending", commit_sha = "", description = "Harvest 17.9 import/aliasing bans (3 directives: ban_local_imports, ban_prefix_aliasing, ban_repeated_from_dict)" }
|
||||
t1_3 = { status = "pending", commit_sha = "", description = "Harvest error handling conventions (2 directives: result_error_pattern, nil_sentinel_pattern)" }
|
||||
t1_4 = { status = "pending", commit_sha = "", description = "Harvest type/data-structure conventions (3 directives: typed_dataclass_fields, metadata_boundary_type, update boundary_layer_exception)" }
|
||||
t1_5 = { status = "pending", commit_sha = "", description = "Harvest code style directives (5 directives: one_space_indent, no_comments_in_body, no_diagnostic_noise, type_hints_required, sdm_dependency_tags)" }
|
||||
t1_6 = { status = "pending", commit_sha = "", description = "Harvest file/taxonomy conventions (3 directives: file_naming_convention, no_new_src_files_without_permission, large_files_are_fine)" }
|
||||
t1_7 = { status = "pending", commit_sha = "", description = "Harvest process/workflow directives (10 directives: atomic_per_task_commits, tdd_red_green_required, ban_arbitrary_core_mocking, live_gui_poll_not_sleep, batch_verification_not_isolation, git_hard_bans, ban_day_estimates, no_output_filtering, prefer_targeted_tier_runs, mandatory_research_first)" }
|
||||
t1_8 = { status = "pending", commit_sha = "", description = "Harvest process anti-patterns (6 directives: no_skip_markers_as_avoidance, deduction_loop_limit, report_instead_of_fix_ban, scope_creep_track_doc_ban, inherited_cruft_ask_first, verbose_commit_message_ban)" }
|
||||
t1_9 = { status = "pending", commit_sha = "", description = "Harvest GUI/architecture directives (5 directives: imgui_scope_verification, modular_controller_pattern, ui_delegation_for_hot_reload, strict_state_management, comprehensive_logging)" }
|
||||
t1_10 = { status = "pending", commit_sha = "", description = "Harvest feature-flag + RAG + cache + knowledge directives (4 directives: feature_flag_delete_to_turn_off, rag_six_rules, cache_stable_to_volatile, knowledge_harvest_pattern)" }
|
||||
t1_11 = { status = "pending", commit_sha = "", description = "Commit the directive harvest (48 files)" }
|
||||
# Phase 2: baseline preset + role-prompt bootstrap
|
||||
t2_1 = { status = "pending", commit_sha = "", description = "Create conductor/directives/presets/current_baseline.md (48 directives listed)" }
|
||||
t2_2 = { status = "pending", commit_sha = "", description = "Commit the baseline preset" }
|
||||
t2_3 = { status = "pending", commit_sha = "", description = "Update .opencode/agents/tier1-orchestrator.md with warm with: bootstrap" }
|
||||
t2_4 = { status = "pending", commit_sha = "", description = "Update .opencode/agents/tier2-tech-lead.md with warm with: bootstrap" }
|
||||
t2_5 = { status = "pending", commit_sha = "", description = "Update .opencode/agents/tier3-worker.md with warm with: bootstrap" }
|
||||
t2_6 = { status = "pending", commit_sha = "", description = "Update .opencode/agents/tier4-qa.md with warm with: bootstrap" }
|
||||
t2_7 = { status = "pending", commit_sha = "", description = "Update conductor/tier2/agents/tier2-autonomous.md with warm with: bootstrap" }
|
||||
t2_8 = { status = "pending", commit_sha = "", description = "Commit the 5 role-prompt updates" }
|
||||
# Phase 3: verification + end-of-track
|
||||
t3_1 = { status = "pending", commit_sha = "", description = "Verify directory structure (48 dirs, 48 v1.md files, preset exists, 5 role prompts have warm with:)" }
|
||||
t3_2 = { status = "pending", commit_sha = "", description = "Manual verification: does the LLM follow the warm with: instruction?" }
|
||||
t3_3 = { status = "pending", commit_sha = "", description = "Write docs/reports/TRACK_COMPLETION_directive_hotswap_harness_20260627.md" }
|
||||
t3_4 = { status = "pending", commit_sha = "", description = "Commit the end-of-track report" }
|
||||
|
||||
[verification]
|
||||
phase_1_complete = false
|
||||
phase_2_complete = false
|
||||
phase_3_complete = false
|
||||
directive_count = 48
|
||||
preset_exists = false
|
||||
role_prompts_updated = false
|
||||
|
||||
[campaign_context]
|
||||
campaign_name = "Directive Encoding Campaign (Campaign A)"
|
||||
track_1 = "directive_hotswap_harness_20260627 (THIS; harvest + scaffold + baseline preset + role-prompt bootstrap)"
|
||||
track_2 = "directive_encoding_experiments (future; v2+ variant authoring + preset experimentation)"
|
||||
track_3 = "manual_slop_directive_lab (future; GUI integration)"
|
||||
sibling_campaign = "Video Analysis Campaign 2 (Campaign B; 4 new videos; separate track)"
|
||||
cross_campaign_relationship = "Intellectual cross-pollination; no hard dependency."
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"track_id": "enforcement_gap_closure_20260627",
|
||||
"name": "Enforcement Gap Closure (Boundary-Layer Audit + Optional[T] Audit Widening)",
|
||||
"status": "active",
|
||||
"branch": "master",
|
||||
"created": "2026-06-27",
|
||||
"owner": "Tier 1 (initialized); implementation delegated to Tier 2/3.",
|
||||
"blocked_by": [],
|
||||
"blocks": [],
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"scripts/audit_boundary_layer.py",
|
||||
"scripts/boundary_layer_allowlist.toml",
|
||||
"scripts/audit_optional_returns.py (renamed from audit_optional_in_3_files.py)",
|
||||
"scripts/audit_optional_returns.baseline.json",
|
||||
"tests/test_audit_boundary_layer.py",
|
||||
"tests/test_audit_optional_returns.py",
|
||||
"docs/reports/TRACK_COMPLETION_enforcement_gap_closure_20260627.md"
|
||||
],
|
||||
"modified_files": [
|
||||
"conductor/code_styleguides/python.md (sections 17.7, 17.8, inventory table 449-456)",
|
||||
"conductor/code_styleguides/error_handling.md (cross-reference sweep only)",
|
||||
"docs/AGENTS.md (cross-reference sweep only)",
|
||||
"conductor/tracks.md (active-track row + status)",
|
||||
"conductor/chronology.md (prepend shipment row)"
|
||||
],
|
||||
"deleted_files": [
|
||||
"scripts/audit_optional_in_3_files.py (renamed to audit_optional_returns.py via git mv)"
|
||||
]
|
||||
},
|
||||
"estimated_effort": {
|
||||
"method": "scope (per workflow.md Tier 1 Track Initialization Rules. NO day estimates.)",
|
||||
"phase_1": "4 tasks: 1 test file (10 tests) + 1 audit script + 1 allowlist TOML + green-phase verification",
|
||||
"phase_2": "3 tasks: 1 test file (5 tests) + 1 rename/edit + 1 baseline JSON + green-phase verification",
|
||||
"phase_3": "2 tasks: 1 styleguide inventory edit + 1 cross-reference sweep",
|
||||
"phase_4": "4 tasks: 7-audit verification + 1 end-of-track report + 1 state update + user sign-off"
|
||||
},
|
||||
"verification_criteria": [
|
||||
"G1: scripts/audit_boundary_layer.py exists + AST-scans all src/*.py + exits 1 in --strict on un-allowlisted dict[str, Any] sites",
|
||||
"G2: scripts/boundary_layer_allowlist.toml exists + lists ~14 boundary files with reasons + --show-allowlist prints them",
|
||||
"G3: scripts/audit_optional_returns.py exists (renamed from audit_optional_in_3_files.py) + scans all src/*.py + 3 history.py residuals baselined in audit_optional_returns.baseline.json (strict stays green)",
|
||||
"G4: conductor/code_styleguides/python.md sections 17.7, 17.8, and inventory table reflect post-track reality (audit_boundary_layer implemented; audit_optional_returns implemented; audit_imports implemented)",
|
||||
"G5: cross-reference sweep complete (no enforcement-instruction references to audit_optional_in_3_files.py; historical references preserved)",
|
||||
"G6: tests/test_audit_boundary_layer.py has >=10 tests; all pass",
|
||||
"G7: tests/test_audit_optional_returns.py has >=5 tests; all pass",
|
||||
"G8: docs/reports/TRACK_COMPLETION_enforcement_gap_closure_20260627.md exists; documents contradiction closure (C1, C2, C3-partial, C18-partial, C21) and remaining (C5, C6, C16, C17 - deferred per user directive)",
|
||||
"VC_pre_commit_parallel_safe: ZERO file overlap with the running tier2/post_module_taxonomy_de_cruft_20260627 branch (verified by Tier 1 against ddcec7b0 + TRACK_COMPLETION file-level changes)"
|
||||
],
|
||||
"regressions_and_pre_existing_failures": [],
|
||||
"pre_existing_failures_remaining": [],
|
||||
"deferred_to_followup_tracks": [
|
||||
{
|
||||
"title": "Optional[T] return migration in src/history.py",
|
||||
"description": "3 RETURN_OPTIONAL sites in src/history.py baselined by this track; cruft_elimination_20260627 Phase 6 owns the migration to Result[T] + NIL_T.",
|
||||
"track_status": "planned in cruft_elimination_20260627"
|
||||
},
|
||||
{
|
||||
"title": "dict[str, Any] migration in hot_reloader.py + startup_profiler.py",
|
||||
"description": "2 un-allowlisted boundary violations baselined by this track; a future track promotes them to typed dataclasses (HotReloadSnapshot, ProfilerSnapshot).",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Main-repo pre-commit hook wiring",
|
||||
"description": "The strict mode of the 5 audit scripts (weak_types, boundary_layer, optional_returns, exception_handling, imports) is not wired into the main repo's .git/hooks/ (per contradictions report C4).",
|
||||
"track_status": "not yet initialized"
|
||||
},
|
||||
{
|
||||
"title": "Docs-count drift in docs/Readme.md (C7, C8, C9) + styleguide drift (C16 python.md s10, C17 type_aliases.md line 19) + RAGChunk.id in guides (C6)",
|
||||
"description": "Deferred per user directive 2026-06-27 until tier2 branch stabilizes; these describe code state that exists post-merge of the taxonomy branches.",
|
||||
"track_status": "deferred; will bundle into a docs-sync track post-merge"
|
||||
}
|
||||
],
|
||||
"risk_register": [
|
||||
{
|
||||
"id": "R1",
|
||||
"description": "audit_optional_returns.baseline.json format mismatch with audit_weak_types.baseline.json contract",
|
||||
"likelihood": "medium",
|
||||
"impact": "the renamed --strict mode behaves inconsistently with the existing baseline pattern",
|
||||
"mitigation": "Tier 3 reads scripts/audit_weak_types.py + its baseline JSON before implementing; mirror the exact contract"
|
||||
},
|
||||
{
|
||||
"id": "R2",
|
||||
"description": "Cross-file rename race if Tier 2 branch touches scripts/audit_optional_in_3_files.py in parallel",
|
||||
"likelihood": "low",
|
||||
"impact": "the git mv conflicts with Tier 2 work",
|
||||
"mitigation": "Tier 1 verified post_module_taxonomy_de_cruft TRACK_COMPLETION does not touch audit_optional_*; only scripts/audit_no_models_config_io.py"
|
||||
},
|
||||
{
|
||||
"id": "R3",
|
||||
"description": "Boundary allowlist under-classifies a genuine violation as boundary (false negative)",
|
||||
"likelihood": "medium",
|
||||
"impact": "the audit misses a real dict[str, Any] escape hatch that future LLMs reach for",
|
||||
"mitigation": "Tier 1's spec 'Current State Audit' manually classified the 14 legitimate boundary files + 2 genuine violators; the audit starts from that classification. Reviewer (user) inspects boundary_layer_allowlist.toml before merge."
|
||||
},
|
||||
{
|
||||
"id": "R4",
|
||||
"description": "Over-classification: audit flags a genuine boundary function as a violation (false positive)",
|
||||
"likelihood": "low",
|
||||
"impact": "strict mode is red on a real boundary file; either the allowlist is amended (correct fix) or the violation is suppressed (wrong fix, masks drift)",
|
||||
"mitigation": "Per spec FR1, allowlisting is the explicit 'declare your boundary' mechanism; the reviewer audits the allowlist at merge time. The audit's `--no-allowlist` mode exposes every site so reviewers can spot-check classifications."
|
||||
}
|
||||
],
|
||||
"contradictions_report_cross_reference": {
|
||||
"source": "docs/reports/CONTRADICTIONS_REPORT_20260627.md",
|
||||
"closes": ["C1", "C2", "C3_partial", "C18_partial", "C21"],
|
||||
"defers": ["C5", "C6", "C7", "C8", "C9", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C19", "C20"],
|
||||
"rationale": "C1+C2+C21 are about the Optional audit name+scope (closed by Phase 2 rename+widen). C3-partial is 'audit_imports.py planned but exists' (closed by Phase 3 inventory correction). C18-partial is the audit count (closed by Phase 3). The 14 deferred items are docs-sync (C5-C9, C16, C17) or status drift (C11-C15, C19, C20) that per user directive 2026-06-27 wait for the tier2 taxonomy branch to stabilize before touching master's docs."
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
# Plan: Enforcement Gap Closure (Boundary-Layer Audit + Optional[T] Audit Widening)
|
||||
|
||||
Track: `enforcement_gap_closure_20260627`
|
||||
Branch: master (parallel-safe against `tier2/post_module_taxonomy_de_cruft_20260627`)
|
||||
Spec: `conductor/tracks/enforcement_gap_closure_20260627/spec.md`
|
||||
|
||||
This plan is read by a Tier 3 Worker (or Tier 2). All Python edits MUST use 1-space indentation. No comments in body. CRLF preserved via `manual-slop_edit_file` MCP tool (never native `edit`).
|
||||
|
||||
**Audit-then-specify verification done by Tier 1:** All file:line references below were verified against master at `77b70226` on 2026-06-27.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Boundary-Layer Audit Script
|
||||
|
||||
Focus: Implement `scripts/audit_boundary_layer.py` + `scripts/boundary_layer_allowlist.toml` + tests, mirroring the `audit_imports.py` + `audit_imports_whitelist.toml` contract.
|
||||
|
||||
- [ ] Task 1.1: Write failing tests for `scripts/audit_boundary_layer.py`
|
||||
- **WHERE:** `tests/test_audit_boundary_layer.py` (NEW file)
|
||||
- **WHAT:** 10 tests per spec FR5 (finder detects `dict[str, Any]` in return / param / local; allowlist suppression + WHITELISTED annotation; `--strict` exit 1 on un-allowlisted; `--strict` exit 0 on allowlisted; `--json` shape; missing-file handling; syntax-error handling; `--show-allowlist`).
|
||||
- **HOW:** Use `tmp_path` (or `tests/artifacts/` per workspace_paths.md — see workflow.md "Test Sandbox Hardening") to create a synthetic `src/` tree the audit can scan via a `--src` flag (mirror `audit_weak_types.py --src`). Each test creates 1-2 small .py files with the pattern under test, invokes the audit via `subprocess.run([sys.executable, "scripts/audit_boundary_layer.py", "--src", str(tmp_src), ...], capture_output=True, text=True)` (use `sys.executable` rather than a bare `"python"` so the script runs under the same interpreter as pytest), and asserts on stdout + exit code. Tests MUST fail before the script exists (Red phase).
|
||||
- **SAFETY:** No `live_gui` fixture (these are unit tests of a script). No `unittest.mock.patch` of core code. Pass the `--src` path via argv (Task 1.2 defines `--src` as a CLI flag; no environment variable is specified for it, so `monkeypatch.setenv` does not apply).
|
||||
- **COMMIT:** `test(audit): add 10 failing tests for boundary-layer audit`
|
||||
- **GIT NOTE:** Red-phase tests for `scripts/audit_boundary_layer.py`; cover finder + allowlist + strict + json + error-handling per spec FR1 + FR5.
|
||||
|
||||
- [ ] Task 1.2: Implement `scripts/audit_boundary_layer.py`
|
||||
- **WHERE:** `scripts/audit_boundary_layer.py` (NEW file)
|
||||
- **WHAT:** Implement the audit per spec FR1. The structure mirrors `scripts/audit_imports.py` (309 lines): module docstring → argparse → `audit_file(path) -> list[Finding]` → main loop over `sorted(Path(src).glob("*.py"))` → exit code logic.
|
||||
- **HOW:** Reuse the `audit_optional_in_3_files.py` AST detector pattern (it already has `_annotation_is_optional_arg`; write an analogous `_is_dict_str_any` helper modeled on it). Detection contract (FR1):
|
||||
1. Walk each `ast.FunctionDef` / `AsyncFunctionDef`:
|
||||
- If `node.returns` is `dict[str, Any]` (Subscript with value Name "dict"|"Dict" and slice Tuple `[Name "str", Name "Any"]`) → emit `RETURN_DICT_ANY`.
|
||||
- For each arg in `node.args.posonlyargs + node.args.args + node.args.kwonlyargs`: if `arg.annotation` is `dict[str, Any]` → emit `PARAM_DICT_ANY`.
|
||||
2. Walk each `ast.AnnAssign` inside a function body: if the node's `annotation` (the annotation is an attribute of the `AnnAssign` node itself, not of its `target`) is `dict[str, Any]` → emit `LOCAL_ANNOT_DICT_ANY`.
|
||||
3. Allowlist: load `scripts/boundary_layer_allowlist.toml` (use `tomllib.load` on a file opened in binary mode, `"rb"`); for any file whose relative path is a key, suppress all findings for that file and emit a single `WHITELISTED` finding per file (matches `audit_imports.py` precedent).
|
||||
4. CLI flags: `--strict`, `--json`, `--show-allowlist`, `--no-allowlist`, `--src <path>` (default `"src"`).
|
||||
5. Default mode: print summary table (file, sites, allowlisted) + a list of violations; exit 0.
|
||||
6. `--strict`: same + exit 1 if there are un-allowlisted `RETURN_DICT_ANY` / `PARAM_DICT_ANY` / `LOCAL_ANNOT_DICT_ANY` findings.
|
||||
7. `--json`: print JSON `{files_scanned, files_with_findings, total_findings, by_kind, findings}` and exit 0.
|
||||
8. `--show-allowlist`: print the TOML contents + reasons; exit 0.
|
||||
9. `--no-allowlist`: do not read the TOML; audit all sites.
|
||||
- **SAFETY:** Pure stdlib (`ast`, `argparse`, `json`, `sys`, `pathlib.Path`, `tomllib`). No subprocess to `src/` files.
|
||||
- **COMMIT:** `feat(audit): implement audit_boundary_layer.py per FR1`
|
||||
- **GIT NOTE:** Implements the §17.7 boundary-layer audit; mirrors audit_imports.py contract; allowlist-driven per-file suppression.
|
||||
|
||||
- [ ] Task 1.3: Write `scripts/boundary_layer_allowlist.toml`
|
||||
- **WHERE:** `scripts/boundary_layer_allowlist.toml` (NEW file)
|
||||
- **WHAT:** Initial allowlist with the ~14 legitimate boundary files from spec "Current State Audit": `context_presets.py`, `events.py`, `openai_compatible.py`, `theme_models.py`, `log_registry.py`, `presets.py`, `tool_presets.py`, `personas.py`, `workspace_manager.py`, `paths.py`, `gemini_cli_adapter.py`, `mcp_client.py`, `type_aliases.py`, `session_logger.py`.
|
||||
- **HOW:** Mirror `audit_imports_whitelist.toml` format:
|
||||
- Header comment block (purpose + format).
|
||||
- "Last reviewed: 2026-06-27"
|
||||
- One `[allowlist."<relative_path>"]` entry per file with `reason = "..."` documenting why it's at the wire boundary (the reasons are documented in spec "Current State Audit" — e.g., context_presets = "project_dict is the wire TOML"; events.to_dict = "wire serialization for WS protocol"; etc.).
|
||||
- **SAFETY:** Pure TOML; no code.
|
||||
- **COMMIT:** `feat(audit): seed boundary_layer_allowlist.toml with 14 boundary files`
|
||||
- **GIT NOTE:** Allowlist seeds the §17.7 legitimate boundary; per audit_imports_whitelist.toml precedent.
|
||||
|
||||
- [ ] Task 1.4: Run tests for Phase 1 (Green phase)
|
||||
- **WHAT:** Execute `uv run pytest tests/test_audit_boundary_layer.py -v` (batched-runner convention can also be used: `uv run python scripts/run_tests_batched.py --filter test_audit_boundary_layer`). All 10 tests must pass. If any fail, debug (≤2 retries per workflow.md "Deduction Loop" rule), then STOP and report if still failing.
|
||||
- **COMMIT:** `conductor(state): mark Phase 1 task 1.4 verification` (or skip the commit if no code changes; just verify).
|
||||
- **GIT NOTE:** Green-phase verification for boundary-layer audit + allowlist.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Optional[T] Audit Rename + Widening
|
||||
|
||||
Focus: Rename `audit_optional_in_3_files.py` → `audit_optional_returns.py`, widen from 4 files to all `src/*.py`, baseline the 3 `history.py` residuals.
|
||||
|
||||
- [ ] Task 2.1: Write failing tests for the renamed + widened audit
|
||||
- **WHERE:** `tests/test_audit_optional_returns.py` (NEW file)
|
||||
- **WHAT:** 5 tests per spec FR5: test_renamed_script_exists, test_scans_all_src_files, test_baseline_reading_keeps_strict_green, test_strict_exits_1_above_baseline, test_param_optional_is_warning_not_strict.
|
||||
- **HOW:** For test_scans_all_src_files, use `monkeypatch` + `--src <tmp_src>` flag (the script may need a `--src` flag added in Task 2.2 if it doesn't already have one — current `audit_optional_in_3_files.py` hardcodes the 4-file path; Task 2.2 adds `--src`). Tests must fail against the OLD script (which still hardcodes 4 files).
|
||||
- **SAFETY:** No `live_gui`. No core mocking.
|
||||
- **COMMIT:** `test(audit): add 5 failing tests for audit_optional_returns widening`
|
||||
- **GIT NOTE:** Red-phase tests for the rename + widening to all src/*.py per spec FR3 + FR5.
|
||||
|
||||
- [ ] Task 2.2: Rename + widen `audit_optional_in_3_files.py` → `audit_optional_returns.py`
|
||||
- **WHERE:** `git mv scripts/audit_optional_in_3_files.py scripts/audit_optional_returns.py` then edit the new file.
|
||||
- **WHAT:** Per spec FR3:
|
||||
1. `git mv` the file (preserves history).
|
||||
2. Edit `scripts/audit_optional_returns.py`:
|
||||
- Module docstring: drop "4 baseline files"; say "all `src/*.py` per §17 post-2026-06-27 widening (the successor to `audit_optional_in_3_files.py`, which was renamed + widened on 2026-06-27)."
|
||||
- Replace `BASELINE_FILES: tuple[str, ...] = (...)` with `def _discover_src_files(src_dir: str = "src") -> list[Path]: return sorted(Path(src_dir).glob("*.py"))`.
|
||||
- Update `main()` to iterate `_discover_src_files(args.src)` instead of the hardcoded tuple.
|
||||
- Add `--src <path>` arg (default `"src"`) mirroring `audit_weak_types.py`.
|
||||
- Update `--json` output's `"files_scanned"` field to reflect the glob count.
|
||||
3. Create `scripts/audit_optional_returns.baseline.json` recording the 3 `src/history.py` `RETURN_OPTIONAL` findings so `--strict` exits 0 on master (findings ≤ baseline). Format: same as `audit_weak_types.baseline.json` (a JSON object with a count or a list of `{file, line, function, kind}` entries that strict mode subtracts). The strict-mode logic: load baseline; subtract baseline findings from current findings; exit 1 if residuals > 0. (Mirror `audit_weak_types.py`'s `--strict` + baseline contract — read its source to confirm the exact subtraction mechanism.)
|
||||
- **SAFETY:** No `src/` edits. No tests/ edits except the new test file from Task 2.1.
|
||||
- **COMMIT:** `refactor(audit): rename audit_optional_in_3_files.py -> audit_optional_returns.py; widen to all src/*.py; baseline 3 history.py residuals`
|
||||
- **GIT NOTE:** Closes contradictions C1+C21 (script name) + C2 (Optional ban scope ambiguity); script name + scope + baseline now honest per §17 post-2026-06-27.
|
||||
|
||||
- [ ] Task 2.3: Run tests for Phase 2 (Green phase)
|
||||
- **WHAT:** `uv run pytest tests/test_audit_optional_returns.py -v`. All 5 tests must pass. If failures, ≤2 debug retries; then STOP.
|
||||
- **VERIFY:** Also run the existing audit_optional tests (if any reference the old name, update them — likely there are no callers other than `code_path_audit_20260607`'s historical references which don't run).
|
||||
- **COMMIT:** `conductor(state): mark Phase 2 task 2.3 verification` (or skip if no code changes).
|
||||
- **GIT NOTE:** Green-phase verification for the rename + widening.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Styleguide Doc Reconciliation
|
||||
|
||||
Focus: Fix `python.md` §17 enforcement inventory + §17.8 section to match post-track reality. Close contradictions C3-partial + C18-partial (audit_imports exists), C1+C21 (script renamed), and C2 (scope clarified). C5 (Result notation) is deferred per spec OOS; touch it in this phase only after confirming it has no branch-sensitivity.
|
||||
|
||||
- [ ] Task 3.1: Fix `python.md` §17 inventory table (lines 449-456) + §17.8 enforcement section (lines 357-362)
|
||||
- **WHERE:** `conductor/code_styleguides/python.md`
|
||||
- **WHAT:** Per spec FR4:
|
||||
1. Inventory table (lines 449-456): update the rows:
|
||||
- `dict[str, Any]` ban: ADD a row for `scripts/audit_boundary_layer.py --strict` (implemented this track; reads `boundary_layer_allowlist.toml`; `--no-allowlist` audits all). KEEP the existing `audit_weak_types.py --strict` row (they catch overlapping but distinct shapes — weak_types catches `Any` in any position; boundary_layer specifically targets `dict[str, Any]` in *signatures* outside the allowlisted boundary).
|
||||
- `Optional[T]` returns: change the row from "audit_optional_in_3_files.py covering 4 baseline files" to "audit_optional_returns.py --strict covering all src/*.py; reads audit_optional_returns.baseline.json for the 3 history.py residuals until cruft_elimination Phase 6". Mark "✅ implemented".
|
||||
- Local imports + `_PREFIX` aliasing + repeated `.from_dict()`: change `audit_imports.py` row to "✅ implemented" (was "⚠️ not yet built" — wrong; the script exists at `scripts/audit_imports.py`).
|
||||
- Repeated `.from_dict()`: drop "(no script planned; relies on Tier 2 review)" — covered by `audit_imports.py`.
|
||||
2. §17.8 enforcement section (lines 357-362): rewrite the bullets per spec FR4:
|
||||
- Bullet for `audit_optional_returns.py` → reflects rename + all-src scope.
|
||||
- Bullet for `audit_imports.py` → drop the "(planned per §17.9a)" parenthetical; mark as implemented.
|
||||
- Bullet for `audit_boundary_layer.py --strict` → replace the "boundary_layer audit (planned...)" bullet; describe the script + allowlist + `--no-allowlist` flag.
|
||||
- The "Pre-commit: every commit MUST pass all four audits above" line → "five audits above" (weak_types, boundary_layer, optional_returns, exception_handling, imports).
|
||||
- **HOW:** Use `manual-slop_edit_file` MCP tool. Verify exact line ranges via `manual-slop_get_file_slice` before editing (the line numbers above are approximate; the actual edit replaces a contiguous block). Preserve CRLF.
|
||||
- **SAFETY:** Pure doc edit. No code. No `src/` changes. No tests changes.
|
||||
- **COMMIT:** `docs(python.md): reconcile §17 inventory + §17.8 with post-track reality`
|
||||
- **GIT NOTE:** Closes C3 (audit_imports.py was "planned" but exists), C18 (audit count), C1+C21 reflected in doc; C2 scope clarified.
|
||||
|
||||
- [ ] Task 3.2: Cross-reference sweep for `audit_optional_in_3_files.py` references
|
||||
- **WHAT:** Use `manual-slop_py_find_usages` / `rg` to find ALL references to the old script name across `conductor/` and `docs/`. Per the spec, references likely exist in `error_handling.md:885` + `docs/AGENTS.md §"Convention Enforcement"`. For each reference:
|
||||
- If it's a historical/cross-reference note (e.g., "was `audit_optional_in_3_files.py`"), leave it.
|
||||
- If it's an enforcement-instruction reference (e.g., "run `uv run python scripts/audit_optional_in_3_files.py --strict`"), update to `audit_optional_returns.py`.
|
||||
- **COMMIT:** `docs: update audit_optional_in_3_files.py references to audit_optional_returns.py`
|
||||
- **GIT NOTE:** Historical references preserved (the rename history is documented in python.md:359); enforcement instructions updated.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: End-of-Track Report + State Update
|
||||
|
||||
- [ ] Task 4.1: Run the full 7-audit strict suite (gate verification)
|
||||
- **WHAT:** Execute all 7 audit scripts (now including the 2 new ones this track ships); the first five run in `--strict` mode and the last two are invoked without the flag, exactly as listed:
|
||||
```
|
||||
uv run python scripts/audit_weak_types.py --strict
|
||||
uv run python scripts/audit_boundary_layer.py --strict
|
||||
uv run python scripts/audit_optional_returns.py --strict
|
||||
uv run python scripts/audit_exception_handling.py --strict
|
||||
uv run python scripts/audit_imports.py --strict
|
||||
uv run python scripts/audit_main_thread_imports.py
|
||||
uv run python scripts/audit_no_models_config_io.py
|
||||
```
|
||||
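Expected: all pass. The boundary audit's 2 residuals (`hot_reloader.py` + `startup_profiler.py`) are not among the 14 files seeded into the allowlist by Task 1.3, and Phase 1 as written creates no baseline JSON for `audit_boundary_layer.py`, so `--strict` will exit 1 on them unless one of those two mechanisms covers them — verify and resolve this before this step. The Optional audit's 3 `history.py` residuals are in `audit_optional_returns.baseline.json` (created in Phase 2).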
|
||||
- **VERIFY:** If any audit fails, fix the baseline OR the allowlist. Do NOT mask a real violation; document the residual in the end-of-track report instead.
|
||||
- **COMMIT:** `test(audit): verify all 7 audit gates pass --strict post-track`
|
||||
- **GIT NOTE:** The 7-audit strict suite green; the 2 boundary + 3 Optional residuals baselined per spec.
|
||||
|
||||
- [ ] Task 4.2: Write end-of-track report
|
||||
- **WHERE:** `docs/reports/TRACK_COMPLETION_enforcement_gap_closure_20260627.md` (NEW file)
|
||||
- **WHAT:** Report following the precedent of `TRACK_COMPLETION_post_module_taxonomy_de_cruft_20260627.md`:
|
||||
- TL;DR
|
||||
- Phase summary (each phase + commits + status)
|
||||
- Verification Criteria status (mapped to spec G1-G8)
|
||||
- File-level changes (new + modified + renamed + new test files)
|
||||
- Commits log (atomic, ordered)
|
||||
- Audit gate status (all 7)
|
||||
- Contradictions closed (C1, C2, C3-partial, C18-partial, C21) and remaining (C5–C9, C11–C17, C19, C20 — the 14 items deferred per user directive; cite spec OOS)
|
||||
- Known residuals: 2 boundary (`hot_reloader.py`, `startup_profiler.py`) + 3 Optional (`src/history.py`); these are baselined + owned by future tracks
|
||||
- Next steps for the user (review + the recommended follow-up track)
|
||||
- **COMMIT:** `docs(reports): TRACK_COMPLETION_enforcement_gap_closure_20260627`
|
||||
- **GIT NOTE:** End-of-track report; documents contradiction closure + residual baselines.
|
||||
|
||||
- [ ] Task 4.3: Update `conductor/tracks.md` + `conductor/chronology.md` + `conductor/tracks/enforcement_gap_closure_20260627/state.toml`
|
||||
- **WHAT:**
|
||||
1. `state.toml`: mark all phases "completed" with their checkpoint SHA; set `status = "completed"` + `current_phase = "complete"`.
|
||||
2. `conductor/tracks.md`: if this track's row already exists in the Active Tracks table (the convention of recent tracks is to add the row when the track is initiated), update its status to "shipped"; otherwise add a row for this track with status "shipped".
|
||||
3. `conductor/chronology.md`: prepend a row for `2026-06-27 | enforcement_gap_closure_20260627 | shipped | summary...` at the top of the table.
|
||||
- **COMMIT:** `conductor(state): enforcement_gap_closure_20260627 SHIPPED + TRACK_COMPLETION`
|
||||
- **GIT NOTE:** Track state + chronology + tracks.md closed out.
|
||||
|
||||
- [ ] Task 4.4: Conductor - User Manual Verification (Protocol in workflow.md)
|
||||
- **WHAT:** Per the workflow.md "Phase Completion Verification and Checkpointing Protocol", present the results to the user for confirmation. Present: the 7-audit strict pass result, the test count, the contradictions closed, and the residual baselines. PAUSE for user sign-off.
|
||||
- **COMMIT:** (no commit; this is the user-confirmation gate)
|
||||
- **GIT NOTE:** User sign-off record.
|
||||
@@ -0,0 +1,433 @@
|
||||
# Track Specification: Enforcement Gap Closure (Boundary-Layer Audit + Optional[T] Audit Widening)
|
||||
|
||||
## Overview
|
||||
|
||||
Close the two genuine enforcement gaps in the 7-banned-pattern mandate documented in
|
||||
`conductor/code_styleguides/python.md` §17 (the LLM Default Anti-Patterns):
|
||||
|
||||
1. **The boundary-layer audit** — the script that enforces "no `dict[str, Any]`
|
||||
outside the 2-3 wire-parse functions per file" (`python.md` §17.7). Currently
|
||||
marked "⚠️ not yet built" in the §17 enforcement inventory (`python.md:454`),
|
||||
because the cruft_elimination_20260627 Phase 10 only produced a *report*
|
||||
(`docs/reports/boundary_layer_20260628.md`) — never the *audit script*. This
|
||||
is the one that prevents the next LLM from reaching for `dict[str, Any]` in
|
||||
`app_controller.py` again.
|
||||
|
||||
2. **The `audit_optional_in_3_files.py` rename + widening** — the script
|
||||
currently named `audit_optional_in_3_files.py` actually checks 4 files
|
||||
(the contradictions report C1+C21) and only enforces the `Optional[T]` ban
|
||||
on those 4 baseline files. `python.md:359` already references a successor
|
||||
`audit_optional_returns.py` (claimed "✅ implemented" in the inventory at
|
||||
`python.md:452`) but the rename never happened and the script never widened
|
||||
to all `src/*.py`. This track lands reality on both the script and the doc.
|
||||
|
||||
Both pieces are parallel-safe against the running `post_module_taxonomy_de_cruft_20260627`
|
||||
Tier 2 work: this track touches only `scripts/audit_*`, `scripts/*.toml` (allowlists),
|
||||
`conductor/code_styleguides/python.md` (the inventory table), and new `tests/test_*`
|
||||
files. Zero overlap with `src/models.py`, `tests/test_models*`, `src/api_hooks.py`,
|
||||
`scripts/audit_no_models_config_io.py`, or anything else Tier 2 is modifying.
|
||||
|
||||
## Current State Audit (as of master `77b70226`, branch `tier2/post_module_taxonomy_de_cruft_20260627` `ddcec7b0`)
|
||||
|
||||
### Already Implemented (DO NOT re-implement)
|
||||
|
||||
- `scripts/audit_weak_types.py` (388 lines) — flags `dict[str, Any]`, `Any`,
|
||||
anonymous tuple returns; informational default + `--strict` CI gate; reads
|
||||
`scripts/audit_weak_types.baseline.json`. **Implemented, working.** Covers
|
||||
§17.1 (`dict[str, Any]` / `Any` ban) and §17.2 (anonymous tuples) globally.
|
||||
|
||||
- `scripts/audit_exception_handling.py` (~500 lines) — classifies
|
||||
`try/except/finally/raise` sites into 10 categories; informational default +
|
||||
`--strict` CI gate. **Implemented, working.** Covers §17.3 (silent swallow /
|
||||
broad catch) globally.
|
||||
|
||||
- `scripts/audit_imports.py` (309 lines) — flags local imports (§17.9a),
|
||||
`_PREFIX` aliasing (§17.9b), and repeated `.from_dict()` (§17.9c);
|
||||
informational default + `--strict` CI gate; reads
|
||||
`scripts/audit_imports_whitelist.toml` for vendor-SDK-warmup + hot-reload
|
||||
per-file exemptions. **Implemented, working** (despite `python.md:455-456`
|
||||
marking it "not yet built" — a doc drift this track fixes). Covers §17.9
|
||||
fully.
|
||||
|
||||
- `scripts/audit_imports_whitelist.toml` (81 lines) — per-file whitelist with
|
||||
`reason` field + "Last reviewed" header. **The precedent template** for the
|
||||
new `boundary_layer_allowlist.toml` this track creates.
|
||||
|
||||
- `scripts/audit_optional_in_3_files.py` (122 lines) — AST-scans 4 files
|
||||
(`src/mcp_client.py`, `src/ai_client.py`, `src/rag_engine.py`,
|
||||
`src/code_path_audit.py`); the `BASELINE_FILES` tuple at line 17-22 is the
|
||||
only thing pinning it to those files; the audit logic is generic
|
||||
(`_return_annotation_is_optional`, `_annotation_is_optional_arg`,
|
||||
`audit_file`). **Implementation 100% reusable; only the file glob +
|
||||
name + docs need to change.**
|
||||
|
||||
### Gaps to Fill (This Track's Scope)
|
||||
|
||||
- **GAP-1: No boundary-layer audit script exists.** `python.md:454` and
|
||||
`python.md:361` mark it "planned / not yet built". The
|
||||
`cruft_elimination_20260627` spec describes it at FR1 §72 ("Boundary Layer
|
||||
is EXACTLY 2 places") and G14 ("boundary layer is documented as exactly 2
|
||||
places") but only ever delivered a *report* (`boundary_layer_20260628.md`),
|
||||
never a *static audit*. Without this, the §17.7 contract ("2-3 boundary
|
||||
functions per file, everything else must be typed") is policy-without-teeth.
|
||||
|
||||
- **GAP-2: `audit_optional_in_3_files.py` name lies + scope is too narrow.**
|
||||
- It actually checks 4 files (mcp_client, ai_client, rag_engine,
|
||||
code_path_audit) but is named "_3_files".
|
||||
- It only covers those 4 baseline files. The §17 mandate requires
|
||||
`Optional[T]` return-types banned in *all* `src/*.py`.
|
||||
- `python.md:359` + `python.md:452` already promise an
|
||||
`audit_optional_returns.py` "covering all `src/*.py`" — but no such
|
||||
script exists. The doc claims reality that the code doesn't match.
|
||||
|
||||
- **GAP-3: `python.md` §17 inventory table is internally inconsistent.**
|
||||
Lines 451-456 mark `audit_imports.py` as "not yet built" (false — it exists)
|
||||
and `audit_optional_returns.py` as "implemented" (false — it doesn't exist;
|
||||
only the `audit_optional_in_3_files.py` does). This track corrects both rows
|
||||
to match post-track reality.
|
||||
|
||||
### Verified `dict[str, Any]` Distribution on master (the blast-radius for GAP-1)
|
||||
|
||||
Per the audit-style AST scan I ran on master at `77b70226` (full scan of all
|
||||
`src/*.py`):
|
||||
|
||||
| File | ret sites | param sites | has `from_dict` | calls tomllib/json.loads |
|
||||
|------|-----------|-------------|------------------|--------------------------|
|
||||
| src/theme_models.py | 2 | 2 | yes | yes |
|
||||
| src/context_presets.py | 0 | 3 | no | no |
|
||||
| src/log_registry.py | 2 | 1 | yes | yes |
|
||||
| src/hot_reloader.py | 1 | 1 | no | no |
|
||||
| src/mcp_client.py | 0 | 2 | yes | yes |
|
||||
| src/personas.py | 1 | 1 | yes | yes |
|
||||
| src/presets.py | 1 | 1 | no | yes |
|
||||
| src/tool_presets.py | 1 | 1 | yes | yes |
|
||||
| src/type_aliases.py | 1 | 1 | yes | no |
|
||||
| src/workspace_manager.py | 1 | 1 | yes | yes |
|
||||
| src/events.py | 1 | 0 | no | no |
|
||||
| src/gemini_cli_adapter.py | 1 | 0 | no | yes |
|
||||
| src/openai_compatible.py | 1 | 0 | no | no |
|
||||
| src/paths.py | 1 | 0 | no | yes |
|
||||
| src/session_logger.py | 0 | 1 | no | no |
|
||||
| src/startup_profiler.py | 1 | 0 | no | no |
|
||||
| ... 50 other `src/*.py` | 0 | 0 | (varies) | (varies) |
|
||||
|
||||
Totals: **15 `dict[str, Any]` returns + 15 params across 16 files**; ~50 other
|
||||
files have zero `dict[str, Any]` in signatures.
|
||||
|
||||
Per-file manual classification (the same kind of classification the
|
||||
`audit_imports_whitelist.toml` makes for hot-reload files):
|
||||
|
||||
- **LEGITIMATE BOUNDARY** (audit must allow): `context_presets.py`
|
||||
(`load_all/save_preset/delete_preset(project_dict: Dict[str, Any])` —
|
||||
`project_dict` IS the wire TOML), `events.py` `to_dict()` (wire
|
||||
serialization for the WS protocol), `openai_compatible.py`
|
||||
`_to_dict_tool_call(tc: ToolCall) -> dict[str, Any]` (converts typed
|
||||
`ToolCall` to vendor wire dict), `theme_models.py` (the schema is the wire
|
||||
for `.ini` rendering), `log_registry.py` (JSON-L log shape), `presets.py`,
|
||||
`tool_presets.py`, `personas.py`, `workspace_manager.py`, `paths.py`,
|
||||
`gemini_cli_adapter.py`, `mcp_client.py` (the MCP wire-protocol parsers),
|
||||
`type_aliases.py` (`from_dict(raw: dict[str, Any])` classmethods — the
|
||||
literal definition of boundary), `session_logger.py` (writes JSONL).
|
||||
- **GENUINE VIOLATIONS** (audit should flag, baseline captures them so
|
||||
strict stays green until a migration track fixes): `hot_reloader.py`
|
||||
(`capture_state`/`restore_state(app, ...) -> dict[str, Any]` — internal
|
||||
state, could be a `HotReloadSnapshot` dataclass), `startup_profiler.py`
|
||||
(`snapshot() -> dict[str, Any]` — could be a `ProfilerSnapshot` dataclass).
|
||||
|
||||
So the audit must:
|
||||
1. Find every `dict[str, Any]` in function signatures (param + return +
|
||||
annotated assignment) in every `src/*.py`.
|
||||
2. For each site, check whether its file is allowlisted in
|
||||
`scripts/boundary_layer_allowlist.toml` (per-file entries
|
||||
with a `reason` field, mirroring the `audit_imports_whitelist.toml`
|
||||
contract).
|
||||
3. Exit 1 in `--strict` mode on any *un*-allowlisted site.
|
||||
4. Emit a `WHITELISTED` annotation per allowlisted file so the user sees the
|
||||
audit considered it (mirrors the `audit_imports.py` precedent).
|
||||
5. Ship an initial `boundary_layer_allowlist.toml` listing the ~14 legitimate
|
||||
boundary files identified above, each with a `reason` field documenting
|
||||
why it's at the wire.
|
||||
|
||||
### Verified `Optional[T]` Return-Type Distribution on master (the blast-radius for GAP-2)
|
||||
|
||||
Same AST scan, but counting `Optional[X]` return annotations:
|
||||
- **Total `RETURN_OPTIONAL` violations: 3, in 1 file** (`src/history.py`)
|
||||
- **Total `PARAM_OPTIONAL` (warning only, never blocks strict): 119 across many files**
|
||||
— these are legal per `error_handling.md` ("argument types that may be
|
||||
`None` describe a caller choice, not a runtime failure").
|
||||
|
||||
So widening the audit from 4 files → all `src/*.py` surfaces **3 new strict
|
||||
violations** in `src/history.py`. The existing `audit_optional_in_3_files.py`
|
||||
already covers the 4 baseline files (all clean). This track adds the 3
|
||||
`history.py` sites to a new `audit_optional_returns.baseline.json` so the
|
||||
widened strict gate stays green until cruft_elimination Phase 6 (which owns
|
||||
those 3 sites) actually migrates them. The 3 sites are documented in the
|
||||
baseline JSON; they are NOT fixed by this track (out of scope; the fix belongs to
|
||||
the cruft_elimination Phase 6 Optional[T]-migration work).
|
||||
|
||||
## Goals
|
||||
|
||||
- **G1.** A working `scripts/audit_boundary_layer.py` that AST-scans all
|
||||
`src/*.py` for `dict[str, Any]` in function signatures (params, returns,
|
||||
annotated locals) and exits 1 in `--strict` mode on any un-allowlisted site.
|
||||
|
||||
- **G2.** A working `scripts/boundary_layer_allowlist.toml` that declares the
|
||||
legitimate boundary functions per file, each with a `reason` field, modeled
|
||||
on `audit_imports_whitelist.toml` (with `--show-allowlist` and
|
||||
`--no-allowlist` flags mirroring the imports whitelist precedent).
|
||||
|
||||
- **G3.** `audit_optional_in_3_files.py` renamed to
|
||||
`audit_optional_returns.py`, `BASELINE_FILES` replaced with a `src/*.py`
|
||||
glob, docstrings updated to drop the "3 files" fiction. The 3 `history.py`
|
||||
violations baselined in `audit_optional_returns.baseline.json` so strict
|
||||
stays green. Existing strict callers are updated or aliased accordingly
|
||||
(`code_path_audit_20260607` referenced the old name).
|
||||
|
||||
- **G4.** `python.md` §17 enforcement inventory (lines 449-456) corrected to
|
||||
match post-track reality: `audit_boundary_layer.py` implemented, the renamed
|
||||
`audit_optional_returns.py` "scans all `src/*.py`", `audit_imports.py`
|
||||
marked implemented (it already is), and the inventory's "Pre-commit: every
|
||||
commit MUST pass all four audits" line updated to "five audits" (or
|
||||
whatever the actual post-track count is).
|
||||
|
||||
- **G5.** `conductor/code_styleguides/error_handling.md` and
|
||||
`conductor/code_styleguides/python.md` references to the renamed script
|
||||
updated (any line saying `audit_optional_in_3_files.py` ->
|
||||
`audit_optional_returns.py`, except the one legacy cross-reference note
|
||||
in `python.md:359` documenting the rename history).
|
||||
|
||||
- **G6.** New tests in `tests/test_audit_boundary_layer.py` (≥10 tests:
|
||||
finder detects `dict[str, Any]` in return / param / local annotation;
|
||||
allowlist suppresses findings + emits WHITELISTED; `--strict` exits 1 on
|
||||
un-allowlisted site, exits 0 on allowlisted; `--json` output shape; missing
|
||||
file handling; syntax error handling).
|
||||
|
||||
- **G7.** New/updated tests in `tests/test_audit_optional_returns.py`
|
||||
(or update existing test file if one references the old name): ≥5 tests
|
||||
confirming the widened scope, the rename, baseline reading, and
|
||||
`--strict` behavior.
|
||||
|
||||
- **G8.** End-of-track report at
|
||||
`docs/reports/TRACK_COMPLETION_enforcement_gap_closure_20260627.md`
|
||||
documenting what shipped + the residual violation baselines + any
|
||||
contradictions from `CONTRADICTIONS_REPORT_20260627.md` closed (C1, C2,
|
||||
C3-partial, C18-partial, C21) and which remain (C5–C9, C11–C17, C19, C20 — those
|
||||
are docs-sync and status-drift items deferred until tier2 stabilizes, per user directive
|
||||
2026-06-27).
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
### FR1: `scripts/audit_boundary_layer.py`
|
||||
|
||||
- **CLI contract** mirrors `audit_exception_handling.py` + `audit_imports.py`:
|
||||
- `uv run python scripts/audit_boundary_layer.py` — informational (exits 0)
|
||||
- `uv run python scripts/audit_boundary_layer.py --strict` — exits 1 on
|
||||
any un-allowlisted `dict[str, Any]` signature site
|
||||
- `uv run python scripts/audit_boundary_layer.py --json` — JSON output
|
||||
- `uv run python scripts/audit_boundary_layer.py --show-allowlist` —
|
||||
prints the current allowlist + reasons, exits 0
|
||||
- `uv run python scripts/audit_boundary_layer.py --no-allowlist` —
|
||||
audits all sites regardless of allowlist (for one-off audits)
|
||||
- **Detection contract** — finds `dict[str, Any]` in:
|
||||
- function return annotations (`def f(...) -> dict[str, Any]`)
|
||||
- function parameter annotations (`def f(x: dict[str, Any])`)
|
||||
- annotated assignments to locals at function scope
|
||||
(`acc: dict[str, dict[str, Any]] = {}` — common pattern in vendor adapters)
|
||||
- **Allowlist contract** — reads `scripts/boundary_layer_allowlist.toml`.
|
||||
Per-file entries: `[allowlist."<relative_path>"] reason = "..."`. Within
|
||||
an allowlisted file, ALL `dict[str, Any]` sites are suppressed with a
|
||||
single `WHITELISTED` annotation per file (mirrors `audit_imports.py`
|
||||
precedent; per-line entries would be brittle because the same file has
|
||||
multiple boundary functions). Use `--no-allowlist` to ignore the allowlist.
|
||||
- **Coverage:** all `src/*.py`. The audit does NOT traverse `tests/`,
|
||||
`scripts/`, `simulation/` — those aren't subject to §17.7.
|
||||
- **Defaults:** informational mode prints a summary table (file, sites,
|
||||
allowlisted?) + a list of violations. `--strict` prints the same and
|
||||
exits 1 if there are un-allowlisted sites.
|
||||
- **Source:** 1-space indent, no comments in body, type-hinted, docstrings
|
||||
where the contract is non-obvious. Module docstring explains the §17.7
|
||||
contract + the allowlist pattern.
|
||||
|
||||
### FR2: `scripts/boundary_layer_allowlist.toml`
|
||||
|
||||
- TOML file modeled on `audit_imports_whitelist.toml`:
|
||||
- Header comment block explaining the purpose + the format.
|
||||
- "Last reviewed: 2026-06-27"
|
||||
- `[allowlist."<relative_path>"]` entries for each legitimate boundary
|
||||
file with a `reason` field documenting why it's at the wire boundary.
|
||||
- **Initial contents:** the ~14 legitimate boundary files identified in the
|
||||
Current State Audit (`context_presets.py`, `events.py`,
|
||||
`openai_compatible.py`, `theme_models.py`, `log_registry.py`, `presets.py`,
|
||||
`tool_presets.py`, `personas.py`, `workspace_manager.py`, `paths.py`,
|
||||
`gemini_cli_adapter.py`, `mcp_client.py`, `type_aliases.py`,
|
||||
`session_logger.py`). The two genuine violators (`hot_reloader.py`,
|
||||
`startup_profiler.py`) are NOT in the allowlist — the audit will flag them
|
||||
on master, but `audit_boundary_layer.baseline.json` will record them so
|
||||
`--strict` stays green until a future track migrates them.
|
||||
|
||||
### FR3: Rename + widen `audit_optional_in_3_files.py` → `audit_optional_returns.py`
|
||||
|
||||
- **Rename:** `git mv scripts/audit_optional_in_3_files.py
|
||||
scripts/audit_optional_returns.py` (preserves git history).
|
||||
- **Code changes:**
|
||||
- Module docstring: drop "4 baseline files"; say "all `src/*.py` per
|
||||
§17 post-2026-06-27 widening".
|
||||
- `BASELINE_FILES: tuple[str, ...] = (...)` → `def _discover_src_files() ->
|
||||
list[Path]: return sorted(Path("src").glob("*.py"))` (the precedent is
|
||||
`audit_exception_handling.py`'s glob approach).
|
||||
- `audit_file()` is already generic — no logic change.
|
||||
- Output: the summary line says "scanned N files" with N = the count.
|
||||
- **Baseline file:** create `scripts/audit_optional_returns.baseline.json`
|
||||
recording the 3 `src/history.py` `RETURN_OPTIONAL` violations so
|
||||
`--strict` stays green. The strict-mode behavior: exit 1 if findings >
|
||||
baseline, exit 0 otherwise. (Mirrors `audit_weak_types.py`'s baseline +
|
||||
`--strict` contract — see `audit_weak_types.baseline.json`.)
|
||||
- **Backward-compat:** The old name `audit_optional_in_3_files.py` is gone.
|
||||
Any external references to the old name must be updated. (Per the
|
||||
pre-flight grep, references exist in `python.md:359`, `python.md:452`,
|
||||
and possibly `error_handling.md` — those are doc edits in G5. The
|
||||
`code_path_audit_20260607` track's plan referenced the old name as a
|
||||
cross-reference contract — that's historical; not updated.)
|
||||
|
||||
### FR4: `python.md` §17 enforcement inventory + §17.8 enforcement section
|
||||
|
||||
- **§17 inventory table (lines 449-456)** corrected:
|
||||
- Row for `dict[str, Any]` ban: `audit_weak_types.py` (implemented) +
|
||||
`audit_boundary_layer.py --strict` (implemented this track) — BOTH
|
||||
listed, with the boundary audit's note: "uses
|
||||
`scripts/boundary_layer_allowlist.toml`; use `--no-allowlist` to audit
|
||||
all `src/*.py` without suppression."
|
||||
- Row for `Optional[T]` returns: `audit_optional_returns.py` (renamed +
|
||||
widened to all `src/*.py` this track; reads
|
||||
`audit_optional_returns.baseline.json` for the 3 `history.py` residuals
|
||||
until cruft_elimination Phase 6).
|
||||
- Row for local imports + aliasing + repeated `from_dict()`:
|
||||
`audit_imports.py` — marked "✅ implemented" (CORRECTED from current
|
||||
"⚠️ not yet built").
|
||||
- Row for repeated `.from_dict()`: same as above (covered by
|
||||
`audit_imports.py`).
|
||||
- **§17.8 enforcement section (lines 357-362)** updated:
|
||||
- Bullet for `audit_optional_returns.py` → reflects rename + widening.
|
||||
- Bullet for `audit_imports.py` → marked implemented (drop the parenthetical
|
||||
"planned in §17.9a").
|
||||
- Bullet for "boundary_layer audit (planned...)" → replaced with bullet
|
||||
for `audit_boundary_layer.py --strict` (implemented, references
|
||||
`boundary_layer_allowlist.toml`).
|
||||
- The "Pre-commit: every commit MUST pass all four audits above" line →
|
||||
"five audits" (weak_types, boundary_layer, optional_returns,
|
||||
exception_handling, imports).
|
||||
|
||||
### FR5: Test files
|
||||
|
||||
- **`tests/test_audit_boundary_layer.py`** (NEW) — ≥10 tests:
|
||||
- `test_finder_detects_dict_return_annotation` — synthetic .py with a
|
||||
`def f() -> dict[str, Any]: ...` → finding emitted.
|
||||
- `test_finder_detects_dict_param_annotation` — `def f(x: dict[str, Any])`
|
||||
→ finding emitted.
|
||||
- `test_finder_detects_dict_local_assignment` — `acc: dict[str, Any] = {}`
|
||||
inside a function → finding emitted.
|
||||
- `test_finder_ignores_non_dict_any` — `def f() -> dict[str, int]` → no
|
||||
finding.
|
||||
- `test_allowlist_suppresses_findings` — file in allowlist → findings
|
||||
suppressed, `WHITELISTED` annotation emitted instead.
|
||||
- `test_strict_exits_1_on_violation` — un-allowlisted violation → exit 1.
|
||||
- `test_strict_exits_0_when_allowlisted` — allowlisted file → exit 0.
|
||||
- `test_json_output_shape` — `--json` output has the expected top-level
|
||||
keys (`files_scanned`, `files_with_findings`, `total_findings`,
|
||||
`by_kind`, `findings`).
|
||||
- `test_missing_file_handling` — referenced file absent → graceful
|
||||
`MISSING_FILE` finding, not a crash.
|
||||
- `test_syntax_error_handling` — malformed .py → graceful `SYNTAX_ERROR`
|
||||
finding, not a crash.
|
||||
- `test_show_allowlist_flag` — `--show-allowlist` prints entries, exits 0.
|
||||
- **`tests/test_audit_optional_returns.py`** (NEW) — ≥5 tests:
|
||||
- `test_renamed_script_exists` — `scripts/audit_optional_returns.py`
|
||||
exists; `scripts/audit_optional_in_3_files.py` does NOT.
|
||||
- `test_scans_all_src_files` — audit finds a synthetic `Optional[X]`
|
||||
return in a new file under `src/` that wasn't in the old 4-file
|
||||
baseline. (Use `monkeypatch` to point at a `tmp_path` src/ tree.)
|
||||
- `test_baseline_reading_keeps_strict_green` — with 3 known `history.py`
|
||||
sites baselined, `--strict` exits 0.
|
||||
- `test_strict_exits_1_above_baseline` — add 1 new `Optional[X]` return
|
||||
not in baseline → exit 1.
|
||||
- `test_param_optional_is_warning_not_strict` — `PARAM_OPTIONAL`
|
||||
findings never cause `--strict` to exit 1.
|
||||
|
||||
## Non-Functional Requirements
|
||||
|
||||
- **1-space indentation** for all Python code (hard rule per workflow.md).
|
||||
- **No comments in body** per AGENTS.md "No comments to source code".
|
||||
- **CRLF line endings** preserved on Windows (use `manual-slop_edit_file`
|
||||
MCP tool, not native `edit`, to preserve formatting per workflow.md).
|
||||
- **Atomic per-task commits** — never batch; one task = one commit + one
|
||||
plan/state update commit.
|
||||
- **No diagnostic noise** — no `sys.stderr.write("[FOO] ...")` lines in
|
||||
the audit scripts.
|
||||
- **`--json` mode** produces machine-readable output for CI integration.
|
||||
- **Default mode** is informational (exit 0) per the precedent of every
|
||||
other audit script; `--strict` is the CI gate.
|
||||
- **Performance** — the audit scans all `src/*.py` (~66 files); AST parse
|
||||
+ walk should complete in well under 1 second wall-clock (the existing
|
||||
`audit_weak_types.py` does the same scale and is sub-second).
|
||||
|
||||
## Architecture Reference
|
||||
|
||||
- **`docs/guide_meta_boundary.md`** — the domain-distinction rule; the
|
||||
boundary layer is an Application concept, not a meta-tooling one.
|
||||
- **`docs/reports/boundary_layer_20260628.md`** — the *report* this audit
|
||||
*implements*. Lists every legitimate `Metadata` usage and explains why
|
||||
each is at the wire boundary.
|
||||
- **`conductor/code_styleguides/python.md` §17.7** — the §17.7 contract:
|
||||
"the ONLY place these patterns are allowed is at the literal wire
|
||||
boundary — the function that calls `tomllib.load()`, `json.loads()`, or
|
||||
a vendor SDK's response parser. The boundary is 2-3 functions per file."
|
||||
- **`conductor/code_styleguides/data_oriented_design.md` §8.5** — the
|
||||
Python Type Promotion Mandate (the canonical rule this audit enforces).
|
||||
- **`conductor/code_styleguides/error_handling.md`** — the `Optional[T]`
|
||||
ban (and the `Result[T]` + `NIL_T` replacement pattern).
|
||||
- **`scripts/audit_imports.py` + `scripts/audit_imports_whitelist.toml`** —
|
||||
the precedent template: AST scan + per-file allowlist + `--strict` CI gate
|
||||
+ `--json` / `--show-whitelist` / `--no-whitelist` flags. The new
|
||||
`audit_boundary_layer.py` should match this contract closely.
|
||||
- **`scripts/audit_weak_types.py` + `scripts/audit_weak_types.baseline.json`** —
|
||||
the precedent for the `--strict` baseline-JSON contract (baseline of known
|
||||
violations; `--strict` exits 1 if current findings exceed baseline). The
|
||||
renamed `audit_optional_returns.py` reuses this pattern for the 3
|
||||
`history.py` residuals.
|
||||
- **`docs/reports/CONTRADICTIONS_REPORT_20260627.md`** — the source of the
|
||||
contradictions this track closes: C1 (audit name vs behavior), C2
|
||||
(Optional ban scope ambiguity), C3 (audit_imports "planned" but actually
|
||||
built), C18 (2/7 vs actually 4/7 patterns audited), C21 (script name).
|
||||
- **`docs/reports/TRACK_COMPLETION_post_module_taxonomy_de_cruft_20260627.md`**
|
||||
— current state of the running parallel track; confirms zero file-overlap.
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- **Fixing the 3 `src/history.py` `Optional[T]` returns.** Those belong to
|
||||
`cruft_elimination_20260627` Phase 6 (the deferred Optional[T]-returns
|
||||
migration work). This track only *baselines* them so the widened strict
|
||||
gate stays green; the actual migration is the future track's job.
|
||||
- **Fixing the 2 `hot_reloader.py` + `startup_profiler.py` `dict[str, Any]`
|
||||
violations.** Same logic: baseline only; a future track migrates them to
|
||||
typed dataclasses (`HotReloadSnapshot`, `ProfilerSnapshot`).
|
||||
- **Docs-count drift in `docs/Readme.md`** (providers 5→8, tests 322→251,
|
||||
commands 50+→33). Per user directive 2026-06-27: wait for tier2 branch
|
||||
to stabilize before touching `docs/Readme.md`.
|
||||
- **Styleguide §10 Anti-OOP self-contradiction (C16)** and
|
||||
**`type_aliases.md` line 19 table (C17)** — both deferred per user
|
||||
directive (they describe code state that only exists post-merge of the
|
||||
tier2 taxonomy branches; fixing them now would make master's docs
|
||||
describe code master doesn't have).
|
||||
- **`RAGChunk.id` field in `guide_rag.md` (C6)** — same branch-sensitivity
|
||||
reason; deferred.
|
||||
- **Building the "repeated `.from_dict()` in same expression" enforcement.**
|
||||
`audit_imports.py` already covers it per §17.9c. No new script needed.
|
||||
- **Building `scripts/audit_optional_returns.py` baseline migration path.**
|
||||
The 3 `history.py` sites are simply added to the initial baseline JSON;
|
||||
no migration script is needed.
|
||||
- **Wiring `--strict` mode of `audit_boundary_layer.py` into actual pre-commit
|
||||
hooks in the main repo's `.git/hooks/`.** Per C4 in the contradictions
|
||||
report, pre-commit enforcement is sandbox-only for now; main-repo wiring
|
||||
is a separate track.
|
||||
- **Touching any `src/*.py` source.** This track is pure audit +
|
||||
styleguide + tests. Zero `src/` edits.
|
||||
@@ -0,0 +1,64 @@
|
||||
# Track state for enforcement_gap_closure_20260627
|
||||
# Initialized by Tier 1 Orchestrator on 2026-06-27.
|
||||
# Implementation delegated to Tier 2 (autonomous) or Tier 3 worker dispatch.
|
||||
|
||||
[meta]
|
||||
track_id = "enforcement_gap_closure_20260627"
|
||||
name = "Enforcement Gap Closure (Boundary-Layer Audit + Optional[T] Audit Widening)"
|
||||
status = "active"
|
||||
current_phase = 0 # 0 = pre-Phase 1; bump to 1 when implementation starts
|
||||
last_updated = "2026-06-27"
|
||||
|
||||
[blocked_by]
|
||||
# None. This track is parallel-safe against the running
|
||||
# tier2/post_module_taxonomy_de_cruft_20260627 branch (zero file overlap
|
||||
# verified by Tier 1 against ddcec7b0 + TRACK_COMPLETION file-level changes).
|
||||
|
||||
[blocks]
|
||||
# None. Follow-up tracks (history.py Optional migration, hot_reloader/
|
||||
# startup_profiler dict migration) are documented in metadata.json but not
|
||||
# formally tracked here.
|
||||
|
||||
[phases]
|
||||
# All 4 phases per plan.md. checkpointsha filled when the phase checkpoint
|
||||
# commit is made by the implementing Tier 2/Tier 3.
|
||||
phase_1 = { status = "pending", checkpointsha = "", name = "Boundary-Layer Audit Script (script + allowlist + 10 tests)" }
|
||||
phase_2 = { status = "pending", checkpointsha = "", name = "Optional[T] Audit Rename + Widening (rename + 5 tests + baseline JSON)" }
|
||||
phase_3 = { status = "pending", checkpointsha = "", name = "Styleguide Doc Reconciliation (python.md s17 + cross-ref sweep)" }
|
||||
phase_4 = { status = "pending", checkpointsha = "", name = "End-of-Track Report + State Update + User Sign-off" }
|
||||
|
||||
[tasks]
|
||||
# Phase 1: boundary-layer audit script + allowlist + tests
|
||||
t1_1 = { status = "pending", commit_sha = "", description = "Write 10 failing tests in tests/test_audit_boundary_layer.py (Red phase)" }
|
||||
t1_2 = { status = "pending", commit_sha = "", description = "Implement scripts/audit_boundary_layer.py per spec FR1 (finder + allowlist + strict + json + --show-allowlist + --no-allowlist + --src)" }
|
||||
t1_3 = { status = "pending", commit_sha = "", description = "Write scripts/boundary_layer_allowlist.toml with ~14 boundary files + reasons" }
|
||||
t1_4 = { status = "pending", commit_sha = "", description = "Run tests/test_audit_boundary_layer.py -v (Green phase); verify all 10 pass" }
|
||||
# Phase 2: Optional audit rename + widening
|
||||
t2_1 = { status = "pending", commit_sha = "", description = "Write 5 failing tests in tests/test_audit_optional_returns.py (Red phase)" }
|
||||
t2_2 = { status = "pending", commit_sha = "", description = "git mv audit_optional_in_3_files.py -> audit_optional_returns.py + widen glob to all src/*.py + add --src flag + create audit_optional_returns.baseline.json with 3 history.py residuals" }
|
||||
t2_3 = { status = "pending", commit_sha = "", description = "Run tests/test_audit_optional_returns.py -v (Green phase); verify all 5 pass" }
|
||||
# Phase 3: styleguide doc reconciliation
|
||||
t3_1 = { status = "pending", commit_sha = "", description = "Edit conductor/code_styleguides/python.md s17 inventory table (lines 449-456) + s17.8 enforcement section (lines 357-362) per spec FR4" }
|
||||
t3_2 = { status = "pending", commit_sha = "", description = "Cross-reference sweep for audit_optional_in_3_files.py in conductor/ + docs/ (update enforcement references; preserve historical)" }
|
||||
# Phase 4: end-of-track
|
||||
t4_1 = { status = "pending", commit_sha = "", description = "Run the 7-audit strict suite (verify all pass; the 2 boundary + 3 Optional residuals baselined)" }
|
||||
t4_2 = { status = "pending", commit_sha = "", description = "Write docs/reports/TRACK_COMPLETION_enforcement_gap_closure_20260627.md per spec G8" }
|
||||
t4_3 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md + conductor/chronology.md + state.toml -> status='completed'" }
|
||||
t4_4 = { status = "pending", commit_sha = "", description = "Conductor - User Manual Verification (PAUSE for user sign-off)" }
|
||||
|
||||
[verification]
|
||||
# Filled as phases complete.
|
||||
phase_1_complete = false
|
||||
phase_2_complete = false
|
||||
phase_3_complete = false
|
||||
phase_4_complete = false
|
||||
all_7_audit_gates_strict_pass = false
|
||||
contradictions_closed_c1_c2_c3_partial_c18_partial_c21 = false
|
||||
|
||||
[scope_summary]
|
||||
# Populated by Tier 1; static scope summary for re-warm after compaction.
|
||||
new_files_count = 7
|
||||
modified_files_count = 5
|
||||
deleted_files_count = 1 # via git mv (audit_optional_in_3_files.py -> audit_optional_returns.py)
|
||||
parallel_safe_against_post_module_taxonomy_de_cruft = true
|
||||
parallel_safety_evidence = "Tier 1 verified zero file overlap against ddcec7b0 + TRACK_COMPLETION_post_module_taxonomy_de_cruft_20260627.md file-level changes table on 2026-06-27"
|
||||
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"track_id": "fix_mma_concurrent_tracks_sim_20260627",
|
||||
"name": "Fix MMA Concurrent Tracks Sim Test (tier-3-live_gui regression)",
|
||||
"status": "active",
|
||||
"type": "fix",
|
||||
"date_created": "2026-06-27",
|
||||
"created_by": "tier2-tech-lead",
|
||||
"blocks": [],
|
||||
"blocked_by": {
|
||||
"post_module_taxonomy_de_cruft_20260627": "shipped (the parent track; this is the followup fix for the 1 remaining tier-3 failure)"
|
||||
},
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md"
|
||||
],
|
||||
"modified_files": [
|
||||
"src/app_controller.py",
|
||||
"tests/mock_concurrent_mma.py",
|
||||
"docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md"
|
||||
],
|
||||
"deleted_files": []
|
||||
},
|
||||
"verification_criteria": [
|
||||
"VC1: tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution passes in isolation",
|
||||
"VC2: Tier 3 (tier-3-live_gui) of the batched test suite shows 0 failures",
|
||||
"VC3: No diagnostic stderr lines remain in src/app_controller.py (instrumentation removed)",
|
||||
"VC4: docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md updated to RESOLVED status",
|
||||
"VC5: docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md written",
|
||||
"VC6: No git restore/checkout/reset/stash used during the track (per AGENTS.md HARD BAN)",
|
||||
"VC7: All atomic commits have git notes (per workflow.md Per-Task Commit Protocol)"
|
||||
],
|
||||
"estimated_effort": {
|
||||
"method": "scope (per workflow.md §Tier 1 Track Initialization Rules). NO day estimates.",
|
||||
"scope": "1 task: instrument + diagnose + fix + verify (1 production file + 1 test mock file + 1 report). 3-5 atomic commits."
|
||||
},
|
||||
"risk_register": [
|
||||
"R1 (low): Instrumentation incomplete; failure mode remains hidden - mitigated by adding diagnostics at 3 strategic points (before/after generate_tickets, in except block)",
|
||||
"R2 (medium): Production fix regresses other tests - mitigated by running the targeted tier-3 batched test suite after the fix",
|
||||
"R3 (medium): Mock fix requires deeper understanding of gemini_cli_adapter session reuse - mitigated by reading src/ai_client.py to understand session_id lifecycle",
|
||||
"R4 (low): 30-second test poll may be too short for test infrastructure - mitigated by not changing the poll time; the fix should make the test pass within the existing budget",
|
||||
"R5 (low): Instrumentation leaks into production - mitigated by removing the instrumentation in the same commit that fixes the bug (or follow-up commit)",
|
||||
"R6 (medium): User does not give permission to run the full 11-tier batch - mitigated by running only the targeted tier-3 batch (--tier tier-3-live_gui); ask user for full batch separately"
|
||||
],
|
||||
"out_of_scope": [
|
||||
"Refactoring src/multi_agent_conductor.py (the MMA engine itself)",
|
||||
"Refactoring _cb_accept_tracks or _start_track_logic beyond the minimum fix",
|
||||
"Refactoring tests/mock_concurrent_mma.py beyond the minimum fix",
|
||||
"Adding new MMA concurrent execution tests",
|
||||
"Fixing any other tier failures (RAG flake is pre-existing and out of scope)",
|
||||
"Updating conductor/tracks/post_module_taxonomy_de_cruft_20260627/spec.md (the parent track is SHIPPED)"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
# Plan: fix_mma_concurrent_tracks_sim_20260627
|
||||
|
||||
3 phases, 9 tasks, 4-6 atomic commits. Per-task TDD red-first. The "test" is the existing failing test in `tests/test_mma_concurrent_tracks_sim.py`; the "fix" is the production code in `src/app_controller.py` and the mock in `tests/mock_concurrent_mma.py`.
|
||||
|
||||
## Phase 0: Instrument + diagnose (Tier 2, 1 commit)
|
||||
|
||||
**Focus:** Per workflow.md "The Deduction Loop (kill it)", you are allowed to run a failing test at most 2 times in a single investigation. After 2 failures, STOP running the test. Read the code, predict the failure mode, and instrument ALL the relevant state in one pass. So Phase 0 is the instrumentation pass.
|
||||
|
||||
- [ ] **Task 0.1** [Tier 2]: Add stderr diagnostics to `src/app_controller.py:_start_track_logic_result`
|
||||
- WHERE: `src/app_controller.py:4750-4840` (the `_start_track_logic_result` function)
|
||||
- WHAT: Add 3 stderr write/flush calls:
|
||||
1. BEFORE `conductor_tech_lead.generate_tickets(goal, skeletons)` — log title, goal
|
||||
2. AFTER `generate_tickets` returns — log length of `raw_tickets`
|
||||
3. INSIDE the `except` block at line 4831 — log full traceback via `import traceback; traceback.print_exc()`
|
||||
- HOW: `manual-slop_edit_file` surgical edit (3-10 lines per edit)
|
||||
- SAFETY: `uv run -m pytest tests/test_mma_concurrent_tracks_sim.py -v` still parses (py_check_syntax exits 0)
|
||||
- INSTRUMENTATION LIFETIME: This commit is INTERIM. The instrumentation must be removed in Phase 2 once the root cause is identified. (Per AGENTS.md "No Diagnostic Noise in Production".)
|
||||
- [ ] **COMMIT 0.1:** `chore(diag): add stderr instrumentation to _start_track_logic_result` (Tier 2)
|
||||
- [ ] **GIT NOTE:** "Temporary instrumentation to diagnose test_mma_concurrent_tracks_execution failure. Will be removed in the next commit after root cause is identified."
|
||||
|
||||
- [ ] **Task 0.2** [Tier 2]: Run the test in isolation with the instrumentation
|
||||
- HOW: `uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v -s > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_0.log 2>&1`
|
||||
- Per workflow.md: redirect to log file (NEVER filter output, NEVER use `head`/`tail`)
|
||||
- Read the log file: `manual-slop_read_file tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_0.log`
|
||||
- Identify the failure mode for the 2nd track
|
||||
- **DO NOT** run the test more than 2 times in total (workflow.md "Deduction Loop")
|
||||
|
||||
## Phase 1: Fix the root cause (Tier 3, 1-2 commits)
|
||||
|
||||
**Focus:** Based on Phase 0 diagnosis, fix the actual root cause.
|
||||
|
||||
- [ ] **Task 1.1** [Tier 3]: Fix the root cause in `src/app_controller.py` OR `tests/mock_concurrent_mma.py`
|
||||
- **If Phase 0 diagnosis is "mock routing broken for 2nd call"** (cause A in spec):
|
||||
- WHERE: `tests/mock_concurrent_mma.py` (the routing logic at lines 64-90)
|
||||
- WHAT: The `gemini_cli_adapter` reuses the session_id returned by the previous call. So track-b's call comes in with `--resume mock-sprint-A` (the session_id returned by the previous track's sprint call). The mock must handle this case.
|
||||
- HOW: Add a routing case for `if session_id == "mock-sprint-A" and call_n == N: _emit_sprint_ticket("B")` — but ALSO handle the case where the gemini_cli_adapter passes the latest session_id for both the track-b sprint call and the track-b worker call.
|
||||
- The cleanest fix: don't rely on session_id alone. After epic + sprint-A, the next call is ALWAYS track-b sprint (since we only have 2 tracks). Add a per-call counter that maps to (call_n // 2) % 2 for the track index.
|
||||
- **If Phase 0 diagnosis is "production bug" (cause B/C/D in spec):**
|
||||
- WHERE: `src/app_controller.py:_start_track_logic_result` (line 4750-4840)
|
||||
- WHAT: Fix the specific bug (disk I/O, flat dict missing field, silent exception)
|
||||
- HOW: Surgical `manual-slop_edit_file` fix
|
||||
- SAFETY: `uv run -m pytest tests/test_mma_concurrent_tracks_sim.py -v` shows PASS
|
||||
- [ ] **COMMIT 1.1:** `fix(mma_concurrent): fix 2nd track _start_track_logic not firing` (Tier 3)
|
||||
- Commit message body: explain which root cause was identified and what was changed.
|
||||
- [ ] **GIT NOTE:** "Fixes test_mma_concurrent_tracks_execution by <specific fix>."
|
||||
|
||||
- [ ] **Task 1.2** [Tier 2]: Run the test in isolation to verify the fix
|
||||
- HOW: `uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_1.log 2>&1`
|
||||
- Read the log file and verify PASS
|
||||
- If still failing, **STOP and report to the user** (per workflow.md: the "Surrender" anti-pattern is OK only after the 5-step checklist)
|
||||
|
||||
- [ ] **Task 1.3** [Tier 2]: Run the targeted tier-3 batched test suite to verify no regressions
|
||||
- HOW: `uv run python scripts/run_tests_batched.py --tier tier-3-live_gui > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_tier3.log 2>&1`
|
||||
- Verify: 0 failures in tier-3
|
||||
- Per workflow.md "Isolated-Pass Verification Fallacy" — the only verification that matters is the batched run, not the isolated run
|
||||
|
||||
## Phase 2: Remove instrumentation + write report (Tier 2, 1-2 commits)
|
||||
|
||||
**Focus:** Clean up the temporary instrumentation and write the end-of-track report.
|
||||
|
||||
- [ ] **Task 2.1** [Tier 2]: Remove the stderr instrumentation from `src/app_controller.py:_start_track_logic_result`
|
||||
- WHERE: `src/app_controller.py:4750-4840` (where the 3 stderr lines were added in Phase 0)
|
||||
- WHAT: Remove the 3 stderr write/flush calls
|
||||
- HOW: `manual-slop_edit_file` surgical edit (3 sites)
|
||||
- SAFETY: `git grep "_start_track_logic_result.*stderr" src/app_controller.py` returns 0 hits
|
||||
- [ ] **COMMIT 2.1:** `chore(cleanup): remove diagnostic instrumentation from _start_track_logic_result` (Tier 2)
|
||||
- [ ] **GIT NOTE:** "Removes the temporary stderr instrumentation added in 0.1. The bug fix is in 1.1; this is cleanup."
|
||||
|
||||
- [ ] **Task 2.2** [Tier 2]: Update `docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` to RESOLVED
|
||||
- WHERE: `docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` (the "4. UNRESOLVED" section)
|
||||
- WHAT: Replace "⚠️ UNRESOLVED" with "✅ RESOLVED" and add a link to the fixing commit
|
||||
- HOW: `manual-slop_edit_file` surgical edit
|
||||
- [ ] **COMMIT 2.2:** `docs(report): mark OUTSTANDING_MMA_TEST_FAILURES_20260627.md as RESOLVED` (Tier 2)
|
||||
- [ ] **GIT NOTE:** "Per FR8 of the track spec. The MMA concurrent tracks test is now passing in the batched test suite."
|
||||
|
||||
- [ ] **Task 2.3** [Tier 2]: Write `docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md`
|
||||
- WHERE: `docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md` (new file)
|
||||
- WHAT: Follow the precedent of `TRACK_COMPLETION_post_module_taxonomy_de_cruft_20260627.md`:
|
||||
- Executive summary
|
||||
- 3 root causes already fixed in 635ca552
|
||||
- The 1 root cause fixed in this track
|
||||
- Files changed
|
||||
- Verification results
|
||||
- Suggested next steps
|
||||
- HOW: `Write` tool to create the file
|
||||
- [ ] **COMMIT 2.3:** `docs(reports): TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627` (Tier 2)
|
||||
- [ ] **GIT NOTE:** "End-of-track report. Track is complete; tier-3 of post_module_taxonomy_de_cruft_20260627 is now PASS."
|
||||
|
||||
- [ ] **Task 2.4** [Tier 2]: Update `conductor/tracks/fix_mma_concurrent_tracks_sim_20260627/state.toml` to status = "completed"
|
||||
- WHERE: `conductor/tracks/fix_mma_concurrent_tracks_sim_20260627/state.toml`
|
||||
- WHAT: Set `[meta].status = "completed"`, `[meta].current_phase = "complete"`, fill in task commit SHAs
|
||||
- HOW: `Write` tool
|
||||
- [ ] **COMMIT 2.4:** `conductor(state): fix_mma_concurrent_tracks_sim_20260627 SHIPPED` (Tier 2)
|
||||
- [ ] **GIT NOTE:** "Track SHIPPED. All 7 VCs pass. Tier-3 of the parent track is now PASS."
|
||||
|
||||
## Commit Log (Expected, 4-6 atomic commits)
|
||||
|
||||
1. (Phase 0) `chore(diag): add stderr instrumentation to _start_track_logic_result` (Tier 2)
|
||||
2. (Phase 1) `fix(mma_concurrent): fix 2nd track _start_track_logic not firing` (Tier 3)
|
||||
3. (Phase 2) `chore(cleanup): remove diagnostic instrumentation from _start_track_logic_result` (Tier 2)
|
||||
4. (Phase 2) `docs(report): mark OUTSTANDING_MMA_TEST_FAILURES_20260627.md as RESOLVED` (Tier 2)
|
||||
5. (Phase 2) `docs(reports): TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627` (Tier 2)
|
||||
6. (Phase 2) `conductor(state): fix_mma_concurrent_tracks_sim_20260627 SHIPPED` (Tier 2)
|
||||
|
||||
Plus per-task plan-update commits per workflow.md.
|
||||
|
||||
## Verification Commands
|
||||
|
||||
```bash
|
||||
# Phase 0: Run the test in isolation with instrumentation
|
||||
uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v -s > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_0.log 2>&1
|
||||
|
||||
# Phase 1: Run the test in isolation after the fix
|
||||
uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_1.log 2>&1
|
||||
|
||||
# Phase 1: Run the targeted tier-3 batched suite
|
||||
uv run python scripts/run_tests_batched.py --tier tier-3-live_gui > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_tier3.log 2>&1
|
||||
|
||||
# Phase 2 (optional, ASK USER FIRST per user directive): Run the full 11-tier batch
|
||||
uv run python scripts/run_tests_batched.py > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run_full.log 2>&1
|
||||
|
||||
# Verify VC3: No diagnostic lines in production
|
||||
git grep "_start_track_logic_result.*stderr" src/app_controller.py
|
||||
# Expect: 0 hits
|
||||
|
||||
# Verify VC4: Report is updated
|
||||
grep "RESOLVED" docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md
|
||||
# Expect: 1+ hits
|
||||
|
||||
# Verify VC5: TRACK_COMPLETION exists
|
||||
ls docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md
|
||||
# Expect: file exists
|
||||
```
|
||||
|
||||
## Notes for Tier 3 worker (Phase 1)
|
||||
|
||||
- The "test" is `tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution`. It is the spec.
|
||||
- The fix is in `src/app_controller.py:_start_track_logic_result` OR `tests/mock_concurrent_mma.py`. Choose based on Phase 0 diagnosis.
|
||||
- Use `manual-slop_edit_file` for surgical edits (3-10 lines per edit).
|
||||
- 1-space indentation. CRLF line endings. No comments.
|
||||
- Per `conductor/code_styleguides/python.md` §17: no `dict[str, Any]`, no `Any`, no `Optional[T]`, no `hasattr()` for entity dispatch.
|
||||
- If the fix requires changing the mock's response shape, do NOT change the test — the test exercises the production pipeline.
|
||||
|
||||
## Notes for Tier 2 reviewer (Phases 0 and 2)
|
||||
|
||||
- Phase 0 is the instrumentation pass. The diagnostics are INTERIM and must be removed in Phase 2.
|
||||
- Phase 1 is the fix. Read the test log from Phase 0 BEFORE choosing the fix; don't guess.
|
||||
- Phase 2 is cleanup + report.
|
||||
- Per `AGENTS.md` HARD BAN: no `git restore`, no `git checkout`, no `git reset`, no `git stash`.
|
||||
- Per `AGENTS.md` "No Diagnostic Noise in Production": the instrumentation in Phase 0 must be removed in Phase 2.
|
||||
- Per `conductor/workflow.md` "Pre-commit verification gate": after every commit, run `git diff --cached --stat` + `git show HEAD --stat` + `uv run python scripts/audit_tier2_leaks.py --strict`.
|
||||
|
||||
## See also
|
||||
|
||||
- `conductor/tracks/fix_mma_concurrent_tracks_sim_20260627/spec.md` — the canonical reference
|
||||
- `docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` — the 4 stacked root causes
|
||||
- `conductor/tracks/post_module_taxonomy_de_cruft_20260627/spec.md` — the parent track spec
|
||||
- `conductor/tracks/post_module_taxonomy_de_cruft_20260627/state.toml` — the parent track state
|
||||
- `conductor/code_styleguides/error_handling.md` — the Result[T] + nil-sentinel convention
|
||||
- `conductor/code_styleguides/data_oriented_design.md` §8.5 — the Python Type Promotion Mandate
|
||||
- `conductor/code_styleguides/python.md` §17 — the LLM Default Anti-Patterns
|
||||
- `conductor/workflow.md` §"Process Anti-Patterns" — the 8 anti-patterns to avoid
|
||||
- `AGENTS.md` — the project operating rules + HARD BANs
|
||||
@@ -0,0 +1,207 @@
|
||||
# Track Specification: fix_mma_concurrent_tracks_sim_20260627
|
||||
|
||||
## Overview
|
||||
|
||||
Single-test fix track. The `tier-3-live_gui::test_mma_concurrent_tracks_sim::test_mma_concurrent_tracks_execution` test was failing on the `tier2/post_module_taxonomy_de_cruft_20260627` branch. Per the user directive ("those issues must get resolved we are not sweeping them under the rug"), this track fixes the test to pass in the batched test suite, ships it, and the parent branch is then ready for review.
|
||||
|
||||
The test exercises the full concurrent-MMA flow: plan an epic (returns 2 proposed tracks), accept both, start both concurrently, verify both ticket-A and ticket-B workers appear, verify both tracks complete. The failure was at "accept-tracks" — after `btn_mma_accept_tracks`, only 1 of the 2 proposed tracks was created in the project.
|
||||
|
||||
This track is the **TDD fix for one specific test**. It is NOT a sweep or a refactor; it is a focused investigation + fix + verification.
|
||||
|
||||
## Current State Audit (branch `tier2/post_module_taxonomy_de_cruft_20260627`, measured 2026-06-27)
|
||||
|
||||
| Component | State | Source |
|
||||
|---|---|---|
|
||||
| `tests/test_mma_concurrent_tracks_sim.py` | 144 lines; fails at line 66 ("Tracks not created in project") | `manual-slop_read_file` |
|
||||
| `tests/mock_concurrent_mma.py` | 144 lines; uses file-based call counter; parses `--resume` arg | commit 635ca552 |
|
||||
| `src/app_controller.py:_cb_accept_tracks._bg_task` | Loops `for i, track_data in enumerate(self.proposed_tracks): self._start_track_logic(...)`; only track-a's mock call observed | `manual-slop_get_file_slice` lines 4665-4680 |
|
||||
| `src/app_controller.py:_start_track_logic_result` | Calls `conductor_tech_lead.generate_tickets(goal, skeletons)` → mock returns sprint ticket → `project_manager.save_track_state(track_id, state, ...)` → `self.tracks.append(...)` | `manual-slop_get_file_slice` lines 4750-4840 |
|
||||
| Production sites fixed in 635ca552 | 3 sites: `flat.setdefault(...)["paths"] = ...` → call `flat.to_dict()` first, then `setdefault` on the resulting dict; 1 site: `t_data["id"]` → `t_data.id` | `OUTSTANDING_MMA_TEST_FAILURES_20260627.md` |
|
||||
| 1 test mock fix in 635ca552 | `--resume` arg parsing + call counter | commit 635ca552 |
|
||||
|
||||
## The 4 Stacked Regressions (Root Cause Analysis)
|
||||
|
||||
### 1. `flat_config()` return type change (PRODUCTION BUG — FIXED in 635ca552)
|
||||
|
||||
`flat_config()` in `src/project.py` was changed by `cruft_elimination_20260627` (commit 0d2a9b5e) from `dict[str, Any]` to a **frozen `@dataclass ProjectContext`**. The change was semantic, not just cosmetic. But 3 sites in `src/app_controller.py` mutated the returned object:
|
||||
|
||||
- `_do_generate` (line 4027): `flat["files"] = ...; flat["files"]["paths"] = ...`
|
||||
- `_cb_plan_epic` (line 4604): `flat.setdefault("files", {})["paths"] = ...`
|
||||
- `_start_track_logic_result` (line 4793): `flat.setdefault("files", {})["paths"] = ...`
|
||||
|
||||
Each raised `TypeError: 'ProjectContext' object does not support item assignment`.
|
||||
|
||||
**Fix in 635ca552:** Call `flat.to_dict()` to get a mutable dict.
|
||||
|
||||
### 2. `topological_sort()` return type change (PRODUCTION BUG — FIXED in 635ca552)
|
||||
|
||||
`conductor_tech_lead.topological_sort()` in `src/mma_conductor.py` was changed (also in commit 0d2a9b5e) from `list[str]` to `list[Ticket]`. The `_start_track_logic_result` consumer used dict-style access (`t_data["id"]`, `t_data.get("description")`).
|
||||
|
||||
**Fix in 635ca552:** Use Ticket attribute access (`t_data.id`, `t_data.description`, etc.).
|
||||
|
||||
### 3. `gemini_cli_adapter` `--resume` session reuse (MOCK BUG — FIXED in 635ca552)
|
||||
|
||||
The gemini_cli_adapter now reuses the session_id from the epic call (`mock-epic`) for all subsequent Tier 2/3 calls via `--resume mock-epic`. The original mock `tests/mock_concurrent_mma.py` was written when each LLM call was stateless; it routed on prompt substrings ("PATH: Epic Initialization", "generate the implementation tickets", "You are assigned to Ticket"). In resume mode the prompt is empty (the session is the context), so the routing fell to the default case.
|
||||
|
||||
**Fix in 635ca552:** Parse `--resume` from `sys.argv` and use a persistent file-based call counter to route to per-track responses.
|
||||
|
||||
### 4. ⚠️ UNRESOLVED — 2nd track's `_start_track_logic` never fires
|
||||
|
||||
After fixes 1-3, the test still fails: only 1 sprint-ticket mock call is observed (for track-a); the 2nd call for track-b never happens. The 30-second test poll times out.
|
||||
|
||||
**Hypothesized root cause:** `_start_track_logic` for track-a either hangs OR fails silently. The for loop in `_cb_accept_tracks._bg_task` continues to track-b which also calls `_start_track_logic` and also fails/hangs. The test poll times out before either track completes.
|
||||
|
||||
**Possible causes to investigate:**
|
||||
- `conductor_tech_lead.generate_tickets(goal, skeletons)` returns `[]` (no tickets) for track-a when the adapter can't reuse the session properly → no track created, no error
|
||||
- `project_manager.save_track_state(track_id, state, ...)` blocks on disk I/O
|
||||
- The IO pool is saturated (the bg_task is `submit_io(_bg_task)` and each `_start_track_logic` is synchronous on its own thread)
|
||||
- `aggregate.run(flat)` hangs (the new `flat.to_dict()` conversion may be missing a field that `aggregate.run` requires)
|
||||
- The exception in `except (OSError, IOError, ValueError, TypeError, KeyError, AttributeError, RuntimeError) as e:` at line 4831 catches an exception and returns `Result(data=None, errors=[err])` — but the caller `_start_track_logic` (line 4744) prints `ERROR in _start_track_logic: {err.message}` and continues to the next track in the loop, which also fails. The test poll times out because no track is appended to `self.tracks`.
|
||||
|
||||
## Goals
|
||||
|
||||
| ID | Goal | Acceptance |
|
||||
|---|---|---|
|
||||
| G1 | Diagnose why only 1 of 2 tracks is created in `_cb_accept_tracks._bg_task` | stderr diagnostics + log file show the actual failure mode for each track |
|
||||
| G2 | Fix the production OR test-mock bug that causes the 2nd track to fail | Test passes in isolation AND in the full batched suite |
|
||||
| G3 | Update `docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` to reflect the fix | Report shows RESOLVED status |
|
||||
| G4 | Tier 3 of `tier2/post_module_taxonomy_de_cruft_20260627` goes from FAIL to PASS | `uv run python scripts/run_tests_batched.py --tier tier-3-live_gui` shows 0 failures |
|
||||
| G5 | All 11 batched test tiers pass | `uv run python scripts/run_tests_batched.py` shows 11/11 PASS (or pre-existing RAG flake) |
|
||||
|
||||
## Non-Goals
|
||||
|
||||
- Refactoring the MMA concurrent execution engine (`src/multi_agent_conductor.py`)
|
||||
- Refactoring `_cb_accept_tracks` or `_start_track_logic` beyond the minimum fix
|
||||
- Refactoring `tests/mock_concurrent_mma.py` beyond the minimum fix
|
||||
- Adding new tests for MMA concurrent execution
|
||||
- Fixing any other tier failures (RAG flake is pre-existing and out of scope)
|
||||
- Updating `conductor/tracks/post_module_taxonomy_de_cruft_20260627/spec.md` (the parent track is SHIPPED; this is a follow-up)
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
### FR1: Instrument `_start_track_logic_result` with stderr diagnostics (Tier 3)
|
||||
|
||||
Add 3 `sys.stderr.write` + `sys.stderr.flush` calls:
|
||||
1. BEFORE `conductor_tech_lead.generate_tickets(goal, skeletons)` — log title, goal
|
||||
2. AFTER `generate_tickets` returns — log length of `raw_tickets`
|
||||
3. INSIDE the `except` block at line 4831 — log full traceback via `import traceback; traceback.print_exc()`
|
||||
|
||||
**WHY:** Per workflow.md "The Deduction Loop (kill it)", you are allowed to run a failing test at most 2 times in a single investigation. After 2 failures, STOP running the test. Read the code, predict the failure mode, and instrument ALL the relevant state in one pass.
|
||||
|
||||
### FR2: Run the test in isolation (Tier 2)
|
||||
|
||||
`uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v -s` and capture:
|
||||
- stderr output from `_start_track_logic_result` instrumentation
|
||||
- the mock call counter file at `artifacts/.mock_concurrent_mma_call_count`
|
||||
- the sloppy.py stderr (via the test's log capture)
|
||||
|
||||
**Per workflow.md "Pre-commit verification gate"**, redirect to log file: `... > tests/artifacts/tier2_state/fix_mma_concurrent_tracks_sim_20260627/test_run.log 2>&1`
|
||||
|
||||
### FR3: Diagnose the failure mode (Tier 2)
|
||||
|
||||
Based on FR2 output, identify ONE of:
|
||||
- A. `generate_tickets` returns `[]` (mock routing broken for 2nd call)
|
||||
- B. `project_manager.save_track_state` raises (disk I/O issue)
|
||||
- C. `aggregate.run(flat)` raises (flat dict missing field)
|
||||
- D. The `except` block catches a `RuntimeError` (or other) and the test poll times out
|
||||
|
||||
### FR4: Fix the root cause (Tier 3)
|
||||
|
||||
**Per the user directive: "we should adjust the tests instead"** — but the test exercises the production code path. The test is the spec; the production must be correct. Fix in this priority order:
|
||||
|
||||
1. **If cause A** (mock routing): fix `tests/mock_concurrent_mma.py` to handle the `--resume mock-sprint-A` session reuse (the adapter reuses the session_id returned by the previous call, so track-b's call is `--resume mock-sprint-A` not `--resume mock-epic`).
|
||||
2. **If cause B/C/D** (production bug): fix `src/app_controller.py:_start_track_logic_result` to handle the error gracefully, log the error to the test log, and continue to the next track (instead of silently aborting the loop).
|
||||
|
||||
### FR5: Verify the test passes in isolation (Tier 2)
|
||||
|
||||
`uv run -m pytest tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution -v`
|
||||
|
||||
Must show PASS.
|
||||
|
||||
### FR6: Verify the test passes in the full batched suite (Tier 2)
|
||||
|
||||
**Per workflow.md "Isolated-Pass Verification Fallacy"** — the only verification that matters for `live_gui` tests is the batch run. The test must pass with the other tier-3 tests in the suite.
|
||||
|
||||
`uv run python scripts/run_tests_batched.py --tier tier-3-live_gui`
|
||||
|
||||
Must show 0 failures in tier-3.
|
||||
|
||||
### FR7: Verify all 11 tiers pass (Tier 2)
|
||||
|
||||
`uv run python scripts/run_tests_batched.py`
|
||||
|
||||
**Per user directive ("stop running the batch yourself, ask me")** — ASK the user before running the full 11-tier batch. Show them the targeted tier-3 result first.
|
||||
|
||||
Expected: 11/11 PASS (or 10/11 if the RAG flake is the only remaining failure).
|
||||
|
||||
### FR8: Update `OUTSTANDING_MMA_TEST_FAILURES_20260627.md` (Tier 2)
|
||||
|
||||
Mark the section "4. UNRESOLVED — Second track's `_start_track_logic` never fires" as RESOLVED with a link to the fixing commit.
|
||||
|
||||
### FR9: Write `TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md` (Tier 2)
|
||||
|
||||
Follow the precedent of `TRACK_COMPLETION_post_module_taxonomy_de_cruft_20260627.md`:
|
||||
- Executive summary
|
||||
- 3 root causes fixed (the 3 already in 635ca552)
|
||||
- The 1 root cause fixed in this track
|
||||
- Files changed
|
||||
- Verification results
|
||||
- Suggested next steps
|
||||
|
||||
## Non-Functional Requirements
|
||||
|
||||
- NFR1: 1-space indentation
|
||||
- NFR2: CRLF line endings on Windows
|
||||
- NFR3: No comments in source code
|
||||
- NFR4: Per-task atomic commits with git notes
|
||||
- NFR5: No new pip dependencies
|
||||
- NFR6: Result[T] returns for fallible fns
|
||||
- NFR7: No `git restore` / `git checkout` / `git reset` / `git stash` (per AGENTS.md HARD BAN)
|
||||
- NFR8: Stderr diagnostics must be removed before the final commit (per AGENTS.md "No Diagnostic Noise in Production")
|
||||
|
||||
## Architecture Reference
|
||||
|
||||
- `src/app_controller.py:_cb_accept_tracks._bg_task` (line 4635-4682) — the for loop that should create 2 tracks
|
||||
- `src/app_controller.py:_start_track_logic_result` (line 4750-4840) — the per-track pipeline
|
||||
- `src/multi_agent_conductor.py:ConductorEngine.run` — the engine that spawns workers
|
||||
- `src/ai_client.py:gemini_cli_adapter` (or similar) — the adapter that uses `--resume` for session reuse
|
||||
- `src/mma_conductor.py:topological_sort` — returns `list[Ticket]` (was `list[str]` pre-cruft)
|
||||
- `src/project.py:flat_config` — returns `frozen @dataclass ProjectContext` (was `dict[str, Any]` pre-cruft)
|
||||
- `conductor/code_styleguides/error_handling.md` — the Result[T] + nil-sentinel convention
|
||||
- `conductor/code_styleguides/data_oriented_design.md` §8.5 — the Python Type Promotion Mandate
|
||||
- `conductor/code_styleguides/python.md` §17 — the LLM Default Anti-Patterns
|
||||
|
||||
## Risks
|
||||
|
||||
| # | Risk | Likelihood | Mitigation |
|
||||
|---|---|---|---|
|
||||
| R1 | The instrumentation is incomplete and the failure mode remains hidden | low | Add diagnostics at 3 strategic points: before/after generate_tickets, in the except block |
|
||||
| R2 | The fix requires changes to the production code that may regress other tests | medium | Run the full batched test suite after the fix (with user permission) |
|
||||
| R3 | The mock fix requires a deeper understanding of the gemini_cli_adapter's session reuse | medium | Read `src/ai_client.py:gemini_cli_adapter` (or similar) to understand the session_id lifecycle |
|
||||
| R4 | The test has a 30-second poll that may be too short for the test infrastructure (IO pool + bg_task + subprocess spawn) | low | Document the timing in the test, but don't change the test's poll time (the fix should make the test pass within the existing poll budget) |
|
||||
| R5 | The instrumentation leaks into production (per AGENTS.md "No Diagnostic Noise in Production") | low | Remove the instrumentation in the same commit that fixes the bug (or in a follow-up commit) |
|
||||
| R6 | The user does not give permission to run the full 11-tier batched test suite | medium | Run only the targeted tier-3 batched test (`--tier tier-3-live_gui`); ask user for the full batch separately |
|
||||
|
||||
## Verification Criteria (Definition of Done)
|
||||
|
||||
| # | Criterion | Verification |
|
||||
|---|---|---|
|
||||
| VC1 | The test `test_mma_concurrent_tracks_execution` passes in isolation | `uv run -m pytest tests/test_mma_concurrent_tracks_sim.py -v` shows PASS |
|
||||
| VC2 | Tier 3 of the batched test suite passes (0 failures) | `uv run python scripts/run_tests_batched.py --tier tier-3-live_gui` shows 0 failures |
|
||||
| VC3 | The instrumentation is removed from `src/app_controller.py` | `git grep "_start_track_logic_result.*stderr" src/app_controller.py` returns 0 hits |
|
||||
| VC4 | `OUTSTANDING_MMA_TEST_FAILURES_20260627.md` is updated to RESOLVED | `grep -w "RESOLVED" docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` returns hits (`-w` so the pre-existing "UNRESOLVED" heading does not count as a match) |
|
||||
| VC5 | `TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md` is written | `ls docs/reports/TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md` exists |
|
||||
| VC6 | All diagnostic stderr lines are removed from `src/app_controller.py` | No `[DEBUG] _start_track_logic:` lines remain in production |
|
||||
| VC7 | No `git restore` / `git checkout` / `git reset` / `git stash` used | Audit the git reflog for the branch |
|
||||
|
||||
## See also
|
||||
|
||||
- `docs/reports/OUTSTANDING_MMA_TEST_FAILURES_20260627.md` — the 4 stacked root causes (this track fixes the 4th)
|
||||
- `docs/reports/END_OF_SESSION_post_module_taxonomy_de_cruft_20260627_iteration3.md` — the prior iteration report
|
||||
- `conductor/tracks/post_module_taxonomy_de_cruft_20260627/spec.md` — the parent track spec
|
||||
- `conductor/tracks/post_module_taxonomy_de_cruft_20260627/state.toml` — the parent track state
|
||||
- `conductor/code_styleguides/error_handling.md` — the Result[T] + nil-sentinel convention
|
||||
- `conductor/code_styleguides/data_oriented_design.md` §8.5 — the Python Type Promotion Mandate
|
||||
- `conductor/code_styleguides/python.md` §17 — the LLM Default Anti-Patterns
|
||||
- `conductor/workflow.md` §"Process Anti-Patterns" — the 8 anti-patterns to avoid
|
||||
- `AGENTS.md` — the project operating rules + HARD BANs
|
||||
@@ -0,0 +1,73 @@
|
||||
# Track state for fix_mma_concurrent_tracks_sim_20260627
|
||||
# Updated by Tier 2 Tech Lead as tasks complete
|
||||
|
||||
[meta]
|
||||
track_id = "fix_mma_concurrent_tracks_sim_20260627"
|
||||
name = "Fix MMA Concurrent Tracks Sim Test (tier-3-live_gui regression)"
|
||||
status = "active"
|
||||
current_phase = 1
|
||||
last_updated = "2026-06-27"
|
||||
|
||||
[blocked_by]
|
||||
post_module_taxonomy_de_cruft_20260627 = "shipped (the parent track; this is the followup fix for the 1 remaining tier-3 failure)"
|
||||
|
||||
[blocks]
|
||||
|
||||
[phases]
|
||||
phase_0 = { status = "completed", checkpointsha = "75fdebb0", name = "Instrument + diagnose (3 commits: stderr diag, file-based diag, NameError root cause identification)" }
|
||||
phase_1 = { status = "in_progress", checkpointsha = "e9919059", name = "Fix the root cause (3 commits: TrackMetadata import fix, mock session_id routing fix, mock epic catch-all fix)" }
|
||||
phase_2 = { status = "pending", checkpointsha = "23862d35", name = "Remove instrumentation + write report (3 commits: cleanup, mock fix, TRACK_COMPLETION)" }
|
||||
|
||||
[tasks]
|
||||
t0_1 = { status = "completed", commit_sha = "75fdebb0", description = "Add stderr diagnostics to _start_track_logic_result" }
|
||||
t0_1b = { status = "completed", commit_sha = "d046394a", description = "Add file-based diag instrumentation (5 strategic points)" }
|
||||
t0_2 = { status = "completed", commit_sha = "75fdebb0", description = "Run the test in isolation; capture log; identify NameError as root cause" }
|
||||
t1_1 = { status = "completed", commit_sha = "e9919059", description = "Add TrackMetadata to import; change models.Metadata to TrackMetadata" }
|
||||
t1_1b = { status = "completed", commit_sha = "913aa48c", description = "Fix mock sprint routing (replace session_id-based with prompt-content-based)" }
|
||||
t1_1c = { status = "completed", commit_sha = "fad1755b", description = "Fix mock epic routing to be a catch-all for any non-empty prompt (stress test prompt 'STRESS TEST: TRACK A AND TRACK B' was not matched by the old literal 'PATH: Epic Initialization' check)" }
|
||||
t1_2 = { status = "completed", commit_sha = "e9919059", description = "Run the test in isolation to verify the fix (5 consecutive PASS runs of execution test)" }
|
||||
t1_2b = { status = "completed", commit_sha = "fad1755b", description = "Run both tests in isolation to verify the stress test fix (3 consecutive PASS runs)" }
|
||||
t1_3 = { status = "completed", commit_sha = "e9919059", description = "Verify no regressions in related tests (test_app_controller_result, test_conductor_tech_lead all pass except pre-existing broad_except test)" }
|
||||
t2_1 = { status = "completed", commit_sha = "23862d35", description = "Remove the stderr and file-based instrumentation from _start_track_logic_result" }
|
||||
t2_2 = { status = "completed", commit_sha = "fad1755b", description = "Update OUTSTANDING_MMA_TEST_FAILURES_20260627.md to add section 6 (stress test fix)" }
|
||||
t2_3 = { status = "in_progress", commit_sha = "", description = "Update TRACK_COMPLETION_fix_mma_concurrent_tracks_sim_20260627.md to include the stress test fix" }
|
||||
t2_4 = { status = "pending", commit_sha = "", description = "Update state.toml to status = completed; final SHIPPED commit" }
|
||||
|
||||
[verification]
|
||||
phase_0_complete = true
|
||||
phase_1_complete = true
|
||||
phase_2_complete = false
|
||||
|
||||
phase_0_diagnosis = "NameError: name 'models' is not defined at src/app_controller.py:4830"
|
||||
phase_1_fix_commits = ["e9919059", "913aa48c", "fad1755b"]
|
||||
phase_2_cleanup_commits = ["23862d35"]
|
||||
|
||||
[track_specific]
|
||||
test_failing = "tests/test_mma_concurrent_tracks_sim.py::test_mma_concurrent_tracks_execution AND tests/test_mma_concurrent_tracks_stress_sim.py::test_mma_concurrent_tracks_stress"
|
||||
parent_track = "post_module_taxonomy_de_cruft_20260627"
|
||||
parent_track_shipped_commit = "d74b9822"
|
||||
prior_partial_fix_commit = "635ca552"
|
||||
prior_fixes_in_635ca552 = [
|
||||
"flat.setdefault(...)[...] = ... on frozen ProjectContext (3 sites)",
|
||||
"t_data['id'] on Ticket objects (1 site)",
|
||||
"mock_concurrent_mma.py --resume handling (initial fix; superseded by 913aa48c and fad1755b)"
|
||||
]
|
||||
root_causes_identified = [
|
||||
"NameError: name 'models' is not defined at src/app_controller.py:4830 (missing TrackMetadata import after de-cruft migration removed 'from src import models')",
|
||||
"Mock sprint routing fragile to test ordering and session_id chain pattern (session_id='mock-sprint-A' incorrectly routed to sprint-A instead of sprint-B)",
|
||||
"Mock epic branch only matched literal 'PATH: Epic Initialization' (stress test prompt 'STRESS TEST: TRACK A AND TRACK B' fell to Default which returns text, not JSON)"
|
||||
]
|
||||
fixes_shipped = [
|
||||
"e9919059: Added TrackMetadata to 'from src.mma import' line; changed 'models.Metadata(...)' to 'TrackMetadata(...)'",
|
||||
"913aa48c: Replaced session_id-based mock sprint routing with prompt-content-based routing",
|
||||
"fad1755b: Restructured mock routing so sprint/worker checked first, then epic catch-all for any non-empty prompt"
|
||||
]
|
||||
stability_test = "3 consecutive PASS runs of BOTH tests (13.94s, 14.81s, 14.13s)"
|
||||
flakiness_rate = "0% (was previously 100% for stress test, ~25% for execution test)"
|
||||
audit_main_thread_imports = "OK: 28 files in main-thread import graph; no heavy top-level imports"
|
||||
audit_weak_types = "informational; no new violations"
|
||||
pre_existing_failures_remaining = ["test_app_controller_result.py::test_app_controller_does_not_use_broad_except (8 INTERNAL_BROAD_CATCH sites; not introduced by this track)"]
|
||||
followups = [
|
||||
"Run full 11-tier batched test suite for final verification (the user should run this after merge review)",
|
||||
"Add 'artifacts/' to .gitignore (mock counter file is project-tree but should be in tests/artifacts/ per workspace_paths.md)"
|
||||
]
|
||||
@@ -0,0 +1,107 @@
|
||||
{
|
||||
"track_id": "test_engine_integration_20260627",
|
||||
"name": "ImGui Test Engine Integration (Bridge via API Hooks)",
|
||||
"status": "active",
|
||||
"branch": "master",
|
||||
"created": "2026-06-27",
|
||||
"owner": "Tier 1 (initialized); implementation delegated to Tier 2/3.",
|
||||
"blocked_by": [],
|
||||
"blocks": ["test_engine_docking_tests (Track 2)", "test_engine_capture_regression (Track 3)"],
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"tests/test_test_engine_smoke.py",
|
||||
"docs/reports/TRACK_COMPLETION_test_engine_integration_20260627.md"
|
||||
],
|
||||
"modified_files": [
|
||||
"sloppy.py (add --enable-test-engine CLI flag)",
|
||||
"src/app_controller.py (add test_engine_enabled field)",
|
||||
"src/gui_2.py (enable engine in App.run + _register_imgui_tests method)",
|
||||
"src/api_hooks.py (4 new /api/test_engine/* endpoints)",
|
||||
"src/api_hook_client.py (4 new client methods)",
|
||||
"tests/conftest.py (pass --enable-test-engine in live_gui fixture)",
|
||||
"conductor/tracks.md (add row)",
|
||||
"conductor/chronology.md (prepend row)"
|
||||
],
|
||||
"deleted_files": []
|
||||
},
|
||||
"estimated_effort": {
|
||||
"method": "scope (per workflow.md Tier 1 Track Initialization Rules. NO day estimates.)",
|
||||
"phase_1": "4 tasks: 1 failing test + 1 CLI flag + 1 engine enable + 1 manual verification",
|
||||
"phase_2": "4 tasks: 1 failing tests + 4 endpoints + 4 client methods + green verification",
|
||||
"phase_3": "2 tasks: 1 conftest update + 1 full smoke test verification",
|
||||
"phase_4": "3 tasks: 1 end-of-track report + 1 state update + 1 user sign-off"
|
||||
},
|
||||
"verification_criteria": [
|
||||
"G1: sloppy.py accepts --enable-test-engine; when set, runner_params.use_imgui_test_engine = True + callbacks.register_tests assigned",
|
||||
"G2: App._register_imgui_tests exists + registers at least 1 smoke test via imgui.test_engine.register_test",
|
||||
"G3: HookServer has 4 new /api/test_engine/* endpoints (queue, status, results, abort)",
|
||||
"G4: ApiHookClient has 4 new methods (queue_test, get_test_status, get_test_results, wait_for_test_results)",
|
||||
"G5: live_gui fixture passes --enable-test-engine in subprocess args",
|
||||
"G6: tests/test_test_engine_smoke.py has >=3 tests; all pass (engine enabled + queue+run smoke + results shape)",
|
||||
"G7: docs/reports/TRACK_COMPLETION_test_engine_integration_20260627.md exists; documents threading model verification + Track 2 handoff",
|
||||
"VC_parallel_safe: ZERO file overlap with tier2/post_module_taxonomy_de_cruft_20260627 (touching sloppy.py, gui_2.py:641-700, api_hooks.py, api_hook_client.py, conftest.py — none of which Tier 2 touches) or enforcement_gap_closure_20260627 (touching scripts/audit_*, python.md — zero overlap)"
|
||||
],
|
||||
"regressions_and_pre_existing_failures": [],
|
||||
"pre_existing_failures_remaining": [],
|
||||
"deferred_to_followup_tracks": [
|
||||
{
|
||||
"title": "Track 2: test_engine_docking_tests",
|
||||
"description": "Migrate docking/focus/panel tests (test_workspace_profiles_restoration, test_auto_switch_sim, etc.) to use ctx.dock_into, ctx.window_focus, ctx.window_resize. The bridge built in this track enables it.",
|
||||
"track_status": "planned (Track 2 of 3)"
|
||||
},
|
||||
{
|
||||
"title": "Track 3: test_engine_capture_regression",
|
||||
"description": "Visual regression via ctx.capture_screenshot_window + baseline PNG diff. The capture API is available but not wired in this track.",
|
||||
"track_status": "planned (Track 3 of 3)"
|
||||
},
|
||||
{
|
||||
"title": "Headless test execution",
|
||||
"description": "The test engine requires a live GLFW window. Headless mode (no window) is a future research item; the engine's scenario thread drives the actual render loop.",
|
||||
"track_status": "not yet initialized; research item"
|
||||
},
|
||||
{
|
||||
"title": "Interactive test engine panel",
|
||||
"description": "show_test_engine_windows(engine, True) opens the engine's debug UI. Not shown by default; can be added as a debug toggle in a follow-up.",
|
||||
"track_status": "not yet initialized"
|
||||
}
|
||||
],
|
||||
"risk_register": [
|
||||
{
|
||||
"id": "R1",
|
||||
"description": "GIL-transfer crash: the test engine's scenario thread calls Python test_func from a different thread; if the GIL transfer mechanism in hello_imgui/immapp doesn't work with the app's existing thread layout, the app crashes",
|
||||
"likelihood": "medium",
|
||||
"impact": "hard blocker; the entire test engine approach is invalid if the threading model doesn't work",
|
||||
"mitigation": "Phase 1 Task 1.4 is a manual verification checkpoint that catches this before any further work. If it crashes, STOP and report to user. The demo_testengine.py proves the mechanism works for simple apps; the risk is specific to this app's thread layout (AppController, SyncEventQueue, etc.)"
|
||||
},
|
||||
{
|
||||
"id": "R2",
|
||||
"description": "Label path mismatch: the smoke test's ctx.set_ref('###manual slop') + ctx.item_click('**/Session') may not match the actual label tree",
|
||||
"likelihood": "high",
|
||||
"impact": "smoke test fails with 'item not found'; not a crash, just a wrong path",
|
||||
"mitigation": "Use imgui.show_id_stack_tool_window() or ctx.window_info() to find the correct labels during implementation. The label tree is deterministic (same build, same layout). Once found, the path is stable."
|
||||
},
|
||||
{
|
||||
"id": "R3",
|
||||
"description": "Engine overhead degrades live_gui test performance",
|
||||
"likelihood": "low",
|
||||
"impact": "live_gui tests take longer; batch run exceeds timeout",
|
||||
"mitigation": "The engine is idle when no tests are queued (sub-ms per-frame overhead). The existing fps_idling settings are unchanged. If measurable, the --enable-test-engine flag can be made conditional (only passed when running test_test_engine_* files)."
|
||||
},
|
||||
{
|
||||
"id": "R4",
|
||||
"description": "test_func accesses App state from the scenario thread, causing a race with the GUI render thread",
|
||||
"likelihood": "medium",
|
||||
"impact": "intermittent test failures or state corruption",
|
||||
"mitigation": "The spec FR2 + plan Task 1.3 explicitly document: test_func must NOT directly mutate App/AppController state; it must use ctx.* primitives (which post simulated input to the GUI thread). Reading via ctx.item_info / ctx.window_info is safe (C++ accessors). CHECK() runs on the scenario thread but only writes to the engine's C++ result log (thread-safe)."
|
||||
}
|
||||
],
|
||||
"campaign": {
|
||||
"name": "Test Engine Campaign (3 tracks)",
|
||||
"tracks": [
|
||||
"test_engine_integration_20260627 (THIS TRACK; bridge + smoke test)",
|
||||
"test_engine_docking_tests (Track 2; migrate docking/focus/panel tests)",
|
||||
"test_engine_capture_regression (Track 3; visual regression via screenshot capture)"
|
||||
],
|
||||
"campaign_rationale": "The test engine enables high-fidelity simulation of docking, focus, panel visibility, drag-and-drop, and keyboard input that the current Hook API cannot express. The campaign is split into 3 tracks to isolate risk: Track 1 proves the threading model + bridge work; Track 2 migrates the high-value docking tests; Track 3 adds visual regression. Each track is independently shippable."
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
# Plan: ImGui Test Engine Integration (Bridge via API Hooks)
|
||||
|
||||
Track: `test_engine_integration_20260627`
|
||||
Branch: master (parallel-safe; touches `sloppy.py`, `src/gui_2.py`, `src/app_controller.py`, `src/api_hooks.py`, `src/api_hook_client.py`, `tests/conftest.py`, new `tests/test_test_engine_smoke.py` — zero overlap with the running tier2 taxonomy branch or the enforcement_gap_closure track)
|
||||
Spec: `conductor/tracks/test_engine_integration_20260627/spec.md`
|
||||
|
||||
All Python edits use 1-space indentation. No comments in body. CRLF preserved.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Enable the Test Engine in the App
|
||||
|
||||
Focus: Add `--enable-test-engine` CLI flag, set `runner_params.use_imgui_test_engine`, add the `register_tests` callback with a placeholder smoke test.
|
||||
|
||||
- [ ] Task 1.1: Write failing test for `--enable-test-engine` flag + engine activation
|
||||
- **WHERE:** `tests/test_test_engine_smoke.py` (NEW file)
|
||||
- **WHAT:** Test 1: `test_engine_enabled` — start `live_gui` (which will pass `--enable-test-engine`), verify the engine is active by calling `client.get_test_status()` (new method, implemented in Phase 2) and asserting `queue_empty == True` (engine is running, no tests queued). This test will FAIL until Phases 1–3 land (the endpoint and client method don't exist until Phase 2, and the `live_gui` fixture doesn't pass the flag until Phase 3).
|
||||
- **HOW:** Use the `live_gui` fixture. Call `client.get_test_status()`. Assert the response has a `queue_empty` field. (The method is added in Phase 2; the test is written first per TDD.)
|
||||
- **SAFETY:** No `live_gui` state mutation; just a GET request.
|
||||
- **COMMIT:** `test(smoke): add failing test for test engine activation`
|
||||
- **GIT NOTE:** Red-phase test for the `--enable-test-engine` flag + engine activation.
|
||||
|
||||
- [ ] Task 1.2: Add `--enable-test-engine` CLI flag to `sloppy.py` + `AppController`
|
||||
- **WHERE:** `sloppy.py:35` (add arg), `src/app_controller.py:1042` (add `test_engine_enabled` field)
|
||||
- **WHAT:**
|
||||
1. `sloppy.py`: add `parser.add_argument("--enable-test-engine", action="store_true", help="Enable Dear ImGui Test Engine for automated UI testing")` after the `--enable-test-hooks` line.
|
||||
2. `src/app_controller.py:1042`: add `self.test_engine_enabled: bool = ("--enable-test-engine" in sys.argv)` after the `test_hooks_enabled` line.
|
||||
- **HOW:** Use `manual-slop_edit_file` MCP tool. 1-space indent.
|
||||
- **SAFETY:** The flag is opt-in; normal runs are unaffected.
|
||||
- **COMMIT:** `feat(cli): add --enable-test-engine flag`
|
||||
- **GIT NOTE:** CLI flag for test engine; mirrors the --enable-test-hooks pattern at app_controller.py:1042.
|
||||
|
||||
- [ ] Task 1.3: Enable the engine in `App.run()` + add `_register_imgui_tests` callback
|
||||
- **WHERE:** `src/gui_2.py:641` (after `RunnerParams()` construction) + `src/gui_2.py:~700` (new `_register_imgui_tests` method)
|
||||
- **WHAT:**
|
||||
1. In `App.run()` between line 641 (`self.runner_params = _hi.RunnerParams()`) and line 684 (`callbacks.show_gui = ...`), add:
|
||||
```python
|
||||
if getattr(self.controller, "test_engine_enabled", False):
|
||||
self.runner_params.use_imgui_test_engine = True
|
||||
self.runner_params.callbacks.register_tests = self._register_imgui_tests
|
||||
```
|
||||
2. Add `_register_imgui_tests(self)` method on `App` (after `_post_init`, ~line 700):
|
||||
```python
|
||||
def _register_imgui_tests(self) -> None:
|
||||
from imgui_bundle import hello_imgui
|
||||
from imgui_bundle.imgui import test_engine
|
||||
engine = hello_imgui.get_imgui_test_engine()
|
||||
test = test_engine.register_test(engine, "Smoke Tests", "Tab Switch")
|
||||
def smoke_func(ctx) -> None:
|
||||
from imgui_bundle.imgui.test_engine_checks import CHECK
|
||||
ctx.set_ref("###manual slop")
|
||||
ctx.item_click("**/Session")
|
||||
CHECK(True)
|
||||
test.test_func = smoke_func
|
||||
```
|
||||
The exact `set_ref` + `item_click` targets are determined during implementation by inspecting the running GUI's label tree. The smoke test should click a harmless tab (e.g., switch to "Session" tab) and `CHECK(True)` as a placeholder assertion. The real assertion (verify the tab actually switched) is added once the label path is confirmed.
|
||||
- **HOW:** Use `manual-slop_edit_file` / `manual-slop_py_update_definition` MCP tool. 1-space indent.
|
||||
- **SAFETY:** Guarded by `test_engine_enabled`; normal runs skip this entirely. The `register_tests` callback is only called by `hello_imgui` when `use_imgui_test_engine = True`.
|
||||
- **COMMIT:** `feat(gui): enable test engine + register smoke test via callbacks.register_tests`
|
||||
- **GIT NOTE:** Activates the test engine when --enable-test-engine is set; registers a placeholder smoke test.
|
||||
|
||||
- [ ] Task 1.4: Verify the engine activates (manual)
|
||||
- **WHAT:** Run `uv run python sloppy.py --enable-test-hooks --enable-test-engine` locally. Verify the app starts without crashing (the GIL-transfer mechanism works). Verify `hello_imgui.get_imgui_test_engine()` returns a non-None engine. This is a manual checkpoint before proceeding to Phase 2.
|
||||
- **COMMIT:** (no commit; manual verification checkpoint)
|
||||
- **GIT NOTE:** Manual verification that the engine + GIL transfer works with the app's existing thread layout.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Build the API Hooks Bridge
|
||||
|
||||
Focus: Add the 4 `/api/test_engine/*` endpoints to `HookServer` + the 4 methods to `ApiHookClient`.
|
||||
|
||||
- [ ] Task 2.1: Write failing tests for the 4 new `ApiHookClient` methods
|
||||
- **WHERE:** `tests/test_test_engine_smoke.py` (append to the file from Task 1.1)
|
||||
- **WHAT:** 2 more tests:
|
||||
- `test_queue_and_run_smoke_test`: queue the smoke test via `client.queue_test("Smoke Tests", "Tab Switch")`, poll via `client.wait_for_test_results(timeout=30)`, assert `results["count_success"] >= 1` and `results["count_tested"] >= 1`.
|
||||
- `test_engine_results_shape`: call `client.get_test_results()`, assert the response dict has keys `count_tested`, `count_success`, `count_in_queue`.
|
||||
- **HOW:** Use `live_gui` fixture. These tests fail until Phase 2 + Phase 3 land (the client methods + endpoints don't exist yet).
|
||||
- **SAFETY:** The smoke test queues a harmless tab-switch; no destructive state change.
|
||||
- **COMMIT:** `test(smoke): add failing tests for queue_test + wait_for_test_results + get_test_results`
|
||||
- **GIT NOTE:** Red-phase tests for the 4 new ApiHookClient methods.
|
||||
|
||||
- [ ] Task 2.2: Add the 4 `/api/test_engine/*` endpoints to `HookServer`
|
||||
- **WHERE:** `src/api_hooks.py` — `do_GET` (line 157) + `do_POST` (line 490)
|
||||
- **WHAT:** Add 4 new `elif` branches:
|
||||
1. `do_GET`: `elif self.path == "/api/test_engine/status":` — lazy-import `hello_imgui` + `test_engine`; get engine via `hello_imgui.get_imgui_test_engine()`; call `test_engine.is_test_queue_empty(engine)`; respond `{"queue_empty": bool}`.
|
||||
2. `do_GET`: `elif self.path == "/api/test_engine/results":` — get engine; create `TestEngineResultSummary()`; call `test_engine.get_result_summary(engine, out_results)`; respond `{"count_tested": N, "count_success": N, "count_in_queue": N}`.
|
||||
3. `do_POST`: `elif self.path == "/api/test_engine/queue":` — body `{"group": "...", "name": "..."}`; get engine; find test via `test_engine.find_test_by_name(engine, group, name)`; if found, `test_engine.queue_test(engine, test)`; respond `{"status": "queued"}` or `{"error": "test not found"}` (404).
|
||||
4. `do_POST`: `elif self.path == "/api/test_engine/abort":` — get engine; `test_engine.abort_current_test(engine)`; respond `{"status": "aborted"}`.
|
||||
- **HOW:** Follow the existing endpoint pattern (lines 499-505 for POST, lines 231-241 for GET). Use `_get_app_attr(app, "controller")` to check `test_engine_enabled`; if not enabled, respond 503. Use `json.dumps(...)` for the response body. 1-space indent.
|
||||
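- **SAFETY:** The endpoints run on the HTTP handler thread. `hello_imgui.get_imgui_test_engine()` is a C++ accessor that only returns the engine pointer. `queue_test` / `is_test_queue_empty` / `get_result_summary` / `abort_current_test` are assumed here to be safe to call from that thread (the plan relies on the engine tolerating cross-thread test scheduling); this is an assumption about the engine, not a documented guarantee, so confirm it during the Task 2.4 run and, if it does not hold, marshal the calls onto the render thread via the existing `_pending_gui_tasks` queue.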
|
||||
- **COMMIT:** `feat(api_hooks): add /api/test_engine/* bridge endpoints`
|
||||
- **GIT NOTE:** 4 new endpoints: queue, status, results, abort; bridge the test process to the engine via HTTP.
|
||||
|
||||
- [ ] Task 2.3: Add the 4 new methods to `ApiHookClient`
|
||||
- **WHERE:** `src/api_hook_client.py` (after the existing methods, ~line 500)
|
||||
- **WHAT:** 4 new methods:
|
||||
1. `queue_test(self, group: str, name: str) -> dict` — POST `/api/test_engine/queue` with `{"group": group, "name": name}`; return the response dict.
|
||||
2. `get_test_status(self) -> dict` — GET `/api/test_engine/status`; return `{"queue_empty": bool}`.
|
||||
3. `get_test_results(self) -> dict` — GET `/api/test_engine/results`; return `{"count_tested": N, "count_success": N, "count_in_queue": N}`.
|
||||
4. `wait_for_test_results(self, timeout: float = 30.0) -> dict` — poll `get_test_status()` every 0.5s until `queue_empty == True` or timeout; then return `get_test_results()`. On timeout, return the last results (with a `timed_out: True` field).
|
||||
- **HOW:** Follow the existing method pattern (e.g., `get_status` at line 105, `push_event` at line 156). Use `requests.get/post` + retry. 1-space indent.
|
||||
- **SAFETY:** Pure HTTP client; no thread safety concerns.
|
||||
- **COMMIT:** `feat(api_hook_client): add queue_test + get_test_status + get_test_results + wait_for_test_results`
|
||||
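- **GIT NOTE:** 4 new client methods: queue_test, get_test_status and get_test_results wrap the queue/status/results endpoints (the abort endpoint has no client wrapper in this track); wait_for_test_results is a client-side poll over get_test_status that replaces time.sleep+get_value polling.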
|
||||
|
||||
- [ ] Task 2.4: Run Phase 2 tests (Green phase)
|
||||
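- **WHAT:** `uv run pytest tests/test_test_engine_smoke.py -v --timeout=60`. All 3 tests must pass. Note that the endpoints respond 503 unless the GUI was started with `--enable-test-engine`, and the `live_gui` fixture only gains that flag in Task 3.1; a 503 at this point means the flag is missing, not that the bridge is broken. If the smoke test (test_queue_and_run_smoke_test) fails, the most likely cause is the `set_ref` / `item_click` label path being wrong — debug by using `imgui.show_id_stack_tool_window()` or `ctx.window_info("manual slop")` to find the correct label. If the GIL transfer fails, the app will crash — that's a hard blocker; report to user.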
|
||||
- **COMMIT:** `conductor(state): Phase 2 green-phase verification` (or skip if no code changes)
|
||||
- **GIT NOTE:** Green-phase verification for the 4 new endpoints + 4 new client methods.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Live_gui Fixture + Full Smoke Test
|
||||
|
||||
Focus: Pass `--enable-test-engine` in the `live_gui` fixture + verify the full bridge works end-to-end.
|
||||
|
||||
- [ ] Task 3.1: Update `live_gui` fixture to pass `--enable-test-engine`
|
||||
- **WHERE:** `tests/conftest.py:792`
|
||||
- **WHAT:** Change `gui_args = ["uv", "run", "python", "-u", gui_script, "--enable-test-hooks"]` to include `"--enable-test-engine"`:
|
||||
```python
|
||||
gui_args = ["uv", "run", "python", "-u", gui_script, "--enable-test-hooks", "--enable-test-engine"]
|
||||
```
|
||||
- **HOW:** `manual-slop_edit_file` MCP tool. 1-space indent.
|
||||
- **SAFETY:** The engine is idle when no tests are queued. Existing `live_gui` tests that don't use the test engine are unaffected (the engine adds sub-ms per-frame overhead).
|
||||
- **COMMIT:** `test(conftest): pass --enable-test-engine in live_gui fixture`
|
||||
- **GIT NOTE:** Engine activates on every live_gui run; idle when no tests queued.
|
||||
|
||||
- [ ] Task 3.2: Run the full smoke test suite (Green phase)
|
||||
- **WHAT:** `uv run pytest tests/test_test_engine_smoke.py -v --timeout=60`. All 3 tests pass. Then run a small batch of existing `live_gui` tests to verify no regression: `uv run pytest tests/test_workspace_profiles_restoration.py tests/test_undo_redo_lifecycle.py -v --timeout=120`.
|
||||
- **COMMIT:** `conductor(state): Phase 3 green-phase verification`
|
||||
- **GIT NOTE:** Full bridge verified: pytest → HTTP → HookServer → engine → scenario thread → ctx.item_click → GUI thread → CHECK → results → HTTP → pytest assert.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: End-of-Track Report + State Update
|
||||
|
||||
- [ ] Task 4.1: Write end-of-track report
|
||||
- **WHERE:** `docs/reports/TRACK_COMPLETION_test_engine_integration_20260627.md` (NEW file)
|
||||
- **WHAT:** Report following the precedent:
|
||||
- TL;DR
|
||||
- Phase summary (each phase + commits + status)
|
||||
- Verification Criteria status (mapped to spec G1-G7)
|
||||
- Threading model verification (did the GIL transfer work? any crashes? any state-access issues from the scenario thread?)
|
||||
- The 4 new endpoints + 4 new client methods documented
|
||||
- The smoke test result
|
||||
- Handoff to Track 2 (docking test migration) — what's now possible that wasn't before
|
||||
- Known limitations (engine requires a live window; not headless; the interactive panel is not shown)
|
||||
- **COMMIT:** `docs(reports): TRACK_COMPLETION_test_engine_integration_20260627`
|
||||
- **GIT NOTE:** End-of-track report; documents the bridge + threading model verification + Track 2 handoff.
|
||||
|
||||
- [ ] Task 4.2: Update `conductor/tracks.md` + `conductor/chronology.md` + `state.toml`
|
||||
- **WHAT:**
|
||||
1. `state.toml`: mark all phases "completed" with checkpoint SHA; `status = "completed"`.
|
||||
2. `conductor/tracks.md`: add row for this track (status "shipped").
|
||||
3. `conductor/chronology.md`: prepend row for `2026-06-27 | test_engine_integration_20260627 | shipped | ...`.
|
||||
- **COMMIT:** `conductor(state): test_engine_integration_20260627 SHIPPED + TRACK_COMPLETION`
|
||||
- **GIT NOTE:** Track state + chronology + tracks.md closed out.
|
||||
|
||||
- [ ] Task 4.3: Conductor - User Manual Verification
|
||||
- **WHAT:** Present the results: the smoke test pass, the threading model verification, the 4 new endpoints, the 4 new client methods. PAUSE for user sign-off.
|
||||
- **COMMIT:** (no commit; user-confirmation gate)
|
||||
- **GIT NOTE:** User sign-off record.
|
||||
@@ -0,0 +1,187 @@
|
||||
# Track Specification: ImGui Test Engine Integration (Bridge via API Hooks)
|
||||
|
||||
## Overview
|
||||
|
||||
Integrate the Dear ImGui Test Engine (`imgui_bundle.imgui.test_engine`) into Manual Slop's test infrastructure to enable high-fidelity simulation of user interactions — docking, window focus, panel visibility, drag-and-drop, keyboard input — that the current Hook API cannot express.
|
||||
|
||||
**The design principle:** the API hooks layer (`HookServer` on :8999 + `ApiHookClient`) remains the **single communication boundary** between the test process (pytest) and the GUI subprocess. The test engine is integrated *behind* the API hooks, not alongside them. New `/api/test_engine/*` endpoints bridge the test process to the engine's `queue_test` / `get_result_summary` API. The engine's `test_func` closures run on the engine's scenario thread (GIL-transferred by `hello_imgui`/`immapp`); they use `ctx.item_click("**/Label")`, `ctx.dock_into(src, dst, dir)`, `ctx.window_focus(ref)` etc. to post simulated input events to the GUI render thread. The existing `_pending_gui_tasks` queue and the engine's input simulation are two separate event injection paths into the same GUI thread; they compose without conflict.
|
||||
|
||||
This is **Track 1 of 3** in the test engine campaign. Track 1 = enable the engine + build the bridge + smoke test. Track 2 (follow-up) = migrate docking/focus/panel tests. Track 3 (follow-up) = visual regression via screenshot capture.
|
||||
|
||||
## Current State Audit (as of master `77b70226`)
|
||||
|
||||
### Already Implemented (DO NOT re-implement)
|
||||
|
||||
- **`imgui_bundle` v1.92.5** (pinned in `pyproject.toml:7`) ships the test engine compiled into the nanobind binary. Verified: `from imgui_bundle import imgui; imgui.test_engine.TestEngine` is a live class; `imgui.test_engine.register_test`, `imgui.test_engine.queue_test`, `imgui.test_engine.get_result_summary`, `imgui.test_engine.TestContext` with `dock_into`, `window_focus`, `item_click`, `capture_screenshot_window`, etc. are all present (verified via `dir()` enumeration — ~95 `TestContext` methods + ~35 module-level functions). The `.pyi` stub at `.venv/Lib/site-packages/imgui_bundle/imgui/test_engine.pyi` documents the full API.
|
||||
|
||||
- **`hello_imgui.RunnerParams.use_imgui_test_engine: bool = False`** (`.venv/Lib/site-packages/imgui_bundle/hello_imgui.pyi:2969`) — the flag that enables the engine. When `True`, `hello_imgui`/`immapp` activates the engine in the runner and provides the GIL-transfer mechanism for the scenario thread. The engine is **already compiled into the wheel** (the C++ build flag `-DHELLOIMGUI_WITH_TEST_ENGINE=ON` was set for the published wheel); the Python-side flag just activates it.
|
||||
|
||||
- **`hello_imgui.get_imgui_test_engine()`** (`.venv/Lib/site-packages/imgui_bundle/hello_imgui.pyi:3355`) — returns the live `TestEngine` instance after `use_imgui_test_engine = True`. Verified callable.
|
||||
|
||||
- **`RunnerCallbacks.register_tests: VoidFunction`** (`.venv/Lib/site-packages/imgui_bundle/hello_imgui.pyi:1809`) — the callback that `hello_imgui` invokes at startup to let the app register tests via `imgui.test_engine.register_test(engine, group, name)`. The demo at `.venv/Lib/site-packages/imgui_bundle/demos_python/demos_immapp/demo_testengine.py` shows the full pattern.
|
||||
|
||||
- **`imgui_bundle.imgui.test_engine_checks.CHECK(result: bool)`** — the assertion primitive that emits pass/fail to the engine's result log with file:line traceback. Verified importable.
|
||||
|
||||
- **The app already uses `hello_imgui.RunnerParams` + `immapp.run()`** — the exact integration path the test engine requires:
|
||||
- `src/gui_2.py:641`: `self.runner_params = _hi.RunnerParams()`
|
||||
- `src/gui_2.py:684-688`: `self.runner_params.callbacks.show_gui/show_menus/load_additional_fonts/setup_imgui_style/post_init` are set
|
||||
- `src/gui_2.py:1486`: `immapp.run(app.runner_params, ...)` — the main loop entry point
|
||||
- The GIL-transfer mechanism is built into `immapp.run` when `use_imgui_test_engine = True`; no additional threading code is needed on the Python side.
|
||||
|
||||
- **`HookServer`** (`src/api_hooks.py:857`) — the HTTP server on `127.0.0.1:8999`, started when `--enable-test-hooks` is passed. The `do_GET` method (line 157) and `do_POST` method (line 490) use a flat `if/elif self.path == "/api/..."` dispatch. The server holds `self.app` (the `App` instance) and accesses it via `_get_app_attr(app, ...)` helpers. The `_pending_gui_tasks` queue (`app_controller.py:900`) + `_pending_gui_tasks_lock` (`app_controller.py:822`) + `_process_pending_gui_tasks()` (`app_controller.py:1844`, called per-frame from `gui_2.py:1759`) is the existing thread-safe command queue from HTTP handler thread → main render thread.
|
||||
|
||||
- **`ApiHookClient`** (`src/api_hook_client.py`) — the Python client with retry logic, health-check polling, `wait_for_server(timeout)`, `push_event(action, payload)`, `get_value(item)`, `set_value(item, value)`, `click(item)`, `wait_for_event(event_type, timeout)`, etc. Used by all `live_gui` tests.
|
||||
|
||||
- **`live_gui` fixture** (`tests/conftest.py:641`) — session-scoped; spawns `sloppy.py --enable-test-hooks --config=<temp>` as a subprocess; polls `http://127.0.0.1:8999/status` until ready; yields a `_LiveGuiHandle` with `.client` (an `ApiHookClient`), `.process`, `.workspace`. The fixture's subprocess args are at `conftest.py:792`: `gui_args = ["uv", "run", "python", "-u", gui_script, "--enable-test-hooks"]`.
|
||||
|
||||
- **`sloppy.py`** (79 lines) — the entry point. CLI flags at lines 31-36: `--headless`, `--web-host`, `--web-port`, `--enable-test-hooks`, `--config`. The `else` branch at line 75 (the normal GUI mode) calls `from src.gui_2 import main; main()`.
|
||||
|
||||
- **`AppController.test_hooks_enabled`** (`src/app_controller.py:1042`) — set via `"--enable-test-hooks" in sys.argv` or `SLOP_TEST_HOOKS=1` env var. Same pattern works for `--enable-test-engine`.
|
||||
|
||||
### Gaps to Fill (This Track's Scope)
|
||||
|
||||
- **GAP-1: The test engine is not enabled.** `runner_params.use_imgui_test_engine` is never set to `True`. No `callbacks.register_tests` callback exists. The engine's scenario thread + GIL-transfer mechanism are dormant.
|
||||
|
||||
- **GAP-2: No `/api/test_engine/*` bridge endpoints.** The `HookServer` has no way for the test process to queue a test, poll results, or abort a running test. The test engine API (`queue_test`, `get_result_summary`, `is_test_queue_empty`, `abort_current_test`) is only accessible from inside the GUI process — not from the HTTP boundary.
|
||||
|
||||
- **GAP-3: No `ApiHookClient` methods for test engine operations.** The client has `click`, `set_value`, `push_event`, `wait_for_event` — but no `queue_test`, `wait_for_test_results`, `get_test_results`.
|
||||
|
||||
- **GAP-4: `live_gui` fixture doesn't pass `--enable-test-engine`.** The subprocess at `conftest.py:792` only passes `--enable-test-hooks`. Without the engine flag, the engine won't activate even after GAP-1 is fixed.
|
||||
|
||||
- **GAP-5: No smoke test proving the end-to-end threading model works.** The test engine's scenario thread + GIL transfer is the highest-risk piece. A minimal smoke test (register a trivial test that clicks a known button + asserts a state change, queue it via the API, poll for results, assert pass) is needed to prove the bridge works before Track 2 migrates real tests.
|
||||
|
||||
### Architecture: Why the API hooks + test engine compose
|
||||
|
||||
```
|
||||
pytest test process
|
||||
└── ApiHookClient (HTTP :8999) ← single communication boundary (KEPT)
|
||||
└── HookServer.do_POST ← new /api/test_engine/* endpoints
|
||||
└── imgui.test_engine.queue_test(engine, test) ← schedules on engine
|
||||
└── test.test_func(ctx) ← runs on engine scenario thread
|
||||
└── ctx.item_click("**/Label") ← posts simulated input to GUI thread
|
||||
└── GUI render thread processes the simulated event
|
||||
└── _process_pending_gui_tasks() still runs per-frame
|
||||
(existing queue; unaffected; two separate injection paths)
|
||||
```
|
||||
|
||||
The test engine's `test_func` runs on its own thread (the scenario thread). The `ctx.*` primitives post simulated input events to the ImGui input queue on the GUI render thread. This is the same destination as real user input and the same destination as `_pending_gui_tasks` — but a different injection mechanism. The two paths are independent; they don't share state, locks, or queues. The test engine doesn't touch `_pending_gui_tasks` and vice versa.
|
||||
|
||||
The GIL-transfer caveat (documented at the top of `test_engine.pyi`) is handled by `hello_imgui`/`immapp` when `use_imgui_test_engine = True` — the C++ layer transfers the GIL between the main thread and the scenario thread. No additional Python-side threading code is needed. The `test_func` callback runs with the GIL held; it can safely call `ctx.*` primitives (which are C++ nanobind calls that release the GIL during the simulated input wait).
|
||||
|
||||
## Goals
|
||||
|
||||
- **G1.** `sloppy.py` accepts `--enable-test-engine` CLI flag; when set, `App.run()` sets `runner_params.use_imgui_test_engine = True` + assigns `runner_params.callbacks.register_tests` to a method that registers tests.
|
||||
|
||||
- **G2.** `App` has a `_register_imgui_tests(self)` method (called by `hello_imgui` at startup via the `register_tests` callback) that registers at least one smoke test ("Smoke Tests", "Click Increment Button") via `imgui.test_engine.register_test(engine, group, name)`. The smoke test's `test_func(ctx)` calls `ctx.set_ref("...")` + `ctx.item_click("**/...")` + `CHECK(...)`.
|
||||
|
||||
- **G3.** `HookServer` (in `src/api_hooks.py`) has 4 new endpoints:
|
||||
- `POST /api/test_engine/queue` — body `{"group": "...", "name": "..."}`; finds the test by group+name via `imgui.test_engine.find_test_by_name(engine, group, name)`; calls `queue_test(engine, test)`; responds `{"status": "queued"}`.
|
||||
- `GET /api/test_engine/status` — calls `is_test_queue_empty(engine)`; responds `{"queue_empty": true/false}`.
|
||||
- `GET /api/test_engine/results` — calls `get_result_summary(engine, out_results)`; responds `{"count_tested": N, "count_success": N, "count_in_queue": N}`.
|
||||
- `POST /api/test_engine/abort` — calls `abort_current_test(engine)`; responds `{"status": "aborted"}`.
|
||||
|
||||
- **G4.** `ApiHookClient` (in `src/api_hook_client.py`) has 4 new methods:
|
||||
- `queue_test(group: str, name: str) -> dict` — POST to `/api/test_engine/queue`.
|
||||
- `get_test_status() -> dict` — GET `/api/test_engine/status`.
|
||||
- `get_test_results() -> dict` — GET `/api/test_engine/results`.
|
||||
- `wait_for_test_results(timeout: float = 30.0) -> dict` — polls `get_test_status()` until `queue_empty == True` or timeout; then returns `get_test_results()`.
|
||||
|
||||
- **G5.** The `live_gui` fixture passes `--enable-test-engine` in addition to `--enable-test-hooks` in the subprocess args (`conftest.py:792`). The engine activates on every `live_gui` test run.
|
||||
|
||||
- **G6.** A smoke test in `tests/test_test_engine_smoke.py` that:
|
||||
1. Uses the `live_gui` fixture.
|
||||
2. Queues the smoke test via `client.queue_test("Smoke Tests", "Click Increment Button")`.
|
||||
3. Polls via `client.wait_for_test_results(timeout=30)`.
|
||||
4. Asserts `results["count_success"] >= 1` and `results["count_tested"] >= 1`.
|
||||
This proves the full bridge works: pytest → HTTP → HookServer → engine → scenario thread → `ctx.item_click` → GUI thread → state change → `CHECK` → result log → `get_result_summary` → HTTP → pytest assert.
|
||||
|
||||
- **G7.** End-of-track report at `docs/reports/TRACK_COMPLETION_test_engine_integration_20260627.md` documenting: what shipped, the threading model verification, any GIL-transfer issues encountered, and the handoff to Track 2 (docking test migration).
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
### FR1: `--enable-test-engine` CLI flag
|
||||
|
||||
- `sloppy.py`: add `parser.add_argument("--enable-test-engine", action="store_true", help="Enable the Dear ImGui Test Engine for automated UI testing")` alongside the existing `--enable-test-hooks` flag (line 35).
|
||||
- `src/app_controller.py`: add `self.test_engine_enabled: bool = ("--enable-test-engine" in sys.argv)` near line 1042 (same pattern as `test_hooks_enabled`).
|
||||
- `src/gui_2.py` `App.run()` (line 619): between the `RunnerParams()` construction (line 641) and the `callbacks.show_gui = ...` assignments (line 684), add:
|
||||
```python
|
||||
if getattr(self.controller, "test_engine_enabled", False):
|
||||
self.runner_params.use_imgui_test_engine = True
|
||||
self.runner_params.callbacks.register_tests = self._register_imgui_tests
|
||||
```
|
||||
This is guarded by the flag so normal runs are unaffected.
|
||||
|
||||
### FR2: `App._register_imgui_tests(self)` method
|
||||
|
||||
- New method on `App` in `src/gui_2.py` (near the other callback registrations, ~line 700):
|
||||
```python
|
||||
def _register_imgui_tests(self) -> None:
|
||||
"""Called by hello_imgui at startup to register ImGui Test Engine tests.
|
||||
Reads the live engine via hello_imgui.get_imgui_test_engine().
|
||||
[C: src/gui_2.py:App.run (via callbacks.register_tests)]
|
||||
"""
|
||||
from imgui_bundle import hello_imgui
|
||||
from imgui_bundle.imgui import test_engine
|
||||
engine = hello_imgui.get_imgui_test_engine()
|
||||
# Smoke test: click a known button and verify state change
|
||||
test = test_engine.register_test(engine, "Smoke Tests", "Click Increment Button")
|
||||
def smoke_func(ctx) -> None:
|
||||
from imgui_bundle.imgui.test_engine_checks import CHECK
|
||||
ctx.set_ref("...") # TODO: set to a known window
|
||||
ctx.item_click("**/...") # TODO: click a known button
|
||||
CHECK(True) # TODO: verify state change
|
||||
test.test_func = smoke_func
|
||||
```
|
||||
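The exact button + state to click + verify is determined during implementation by inspecting the running GUI's item tree (use `ctx.window_info` / `imgui.show_id_stack_tool_window` to find labels). The smoke test should click something harmless (e.g., a tab switch, a checkbox toggle) and verify the state changed. Whatever group/name pair is finally passed to `register_test` must be identical to the pair the pytest smoke test passes to `client.queue_test(...)`; otherwise `find_test_by_name` finds nothing and the queue endpoint responds 404.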
|
||||
|
||||
### FR3: `/api/test_engine/*` endpoints in `HookServer`
|
||||
|
||||
- In `src/api_hooks.py` `do_POST` (line 490): add 2 new `elif` branches for `POST /api/test_engine/queue` and `POST /api/test_engine/abort`.
|
||||
- In `src/api_hooks.py` `do_GET` (line 157): add 2 new `elif` branches for `GET /api/test_engine/status` and `GET /api/test_engine/results`.
|
||||
- All 4 endpoints guard on `test_engine_enabled` — if the engine is not active, respond `{"error": "test engine not enabled", "enabled": false}` with HTTP 503.
|
||||
- The engine instance is obtained via `hello_imgui.get_imgui_test_engine()` inside the handler (lazy import; the handler runs on the HTTP thread, but `get_imgui_test_engine()` is a C++ accessor that returns a pointer — safe to call from any thread).
|
||||
|
||||
### FR4: `ApiHookClient` methods
|
||||
|
||||
- In `src/api_hook_client.py`: add 4 methods per G4. Follow the existing method pattern (e.g., `get_status`, `push_event`): construct the URL, `requests.post/get`, retry on connection error, parse JSON, return the dict.
|
||||
|
||||
### FR5: `live_gui` fixture update
|
||||
|
||||
- In `tests/conftest.py:792`: change `gui_args` to include `"--enable-test-engine"` when the fixture spawns the subprocess. The flag flows through to `AppController.test_engine_enabled` → `App.run()` → `runner_params.use_imgui_test_engine = True`.
|
||||
|
||||
### FR6: Smoke test
|
||||
|
||||
- `tests/test_test_engine_smoke.py` (NEW) — 2-3 tests:
|
||||
- `test_engine_enabled`: `client.get_value("test_engine_enabled")` returns True (or verify via a new gettable field).
|
||||
- `test_queue_and_run_smoke_test`: queue the smoke test, poll for results, assert success.
|
||||
- `test_engine_results_shape`: `get_test_results()` returns the expected dict shape.
|
||||
|
||||
## Non-Functional Requirements
|
||||
|
||||
- **1-space indentation** for all Python code.
|
||||
- **No comments in body** per AGENTS.md.
|
||||
- **CRLF line endings** preserved.
|
||||
- **Atomic per-task commits.**
|
||||
- **Thread safety:** the `test_func` runs on the engine scenario thread. It must NOT directly mutate `App` or `AppController` state — it must use `ctx.*` primitives (which post simulated input to the GUI thread). Reading state via `hello_imgui.get_imgui_test_engine()` or engine queries (`ctx.item_info`, `ctx.window_info`) is safe. The `CHECK()` assertion runs on the scenario thread but only writes to the engine's result log (thread-safe C++ structure).
|
||||
- **No `live_gui` regression:** the `--enable-test-engine` flag must not affect normal GUI behavior when `live_gui` tests are NOT using the engine. The engine's scenario thread is idle when no tests are queued. The `show_test_engine_windows` panel is NOT shown by default (only via explicit call).
|
||||
- **Performance:** the engine adds a per-frame overhead when active. The `fps_idling` settings in `runner_params` remain unchanged. The engine's overhead is sub-millisecond per frame when no tests are running.
|
||||
|
||||
## Architecture Reference
|
||||
|
||||
- **`docs/guide_testing.md`** — the `live_gui` fixture, the structural testing contract, the Puppeteer pattern.
|
||||
- **`docs/guide_api_hooks.md`** — the Hook API surface, the `/api/ask` protocol, the `ApiHookClient` method reference.
|
||||
- **`docs/guide_gui_2.md`** — the `App` class lifecycle, the `runner_params` construction, the `callbacks` system.
|
||||
- **`.venv/Lib/site-packages/imgui_bundle/demos_python/demos_immapp/demo_testengine.py`** — the canonical demo for the test engine integration pattern (register_tests callback + test_func closures + CHECK).
|
||||
- **`.venv/Lib/site-packages/imgui_bundle/imgui/test_engine.pyi`** — the full API stub (2644 lines). Key sections: `TestContext` methods (lines 1445-2096), module-level functions (lines 433-500, 2639+), `TestEngineResultSummary` (3 fields: count_tested, count_success, count_in_queue).
|
||||
- **`.venv/Lib/site-packages/imgui_bundle/imgui/test_engine_checks.py`** — the `CHECK(result: bool)` assertion primitive.
|
||||
- **`conductor/workflow.md`** "Live_gui Test Fragility" + "Async Setters Need Poll-For-State" — the existing patterns for `live_gui` tests; `ApiHookClient.wait_for_test_results` replaces ad-hoc `time.sleep` + `get_value` polling with a single client-side wait that polls the engine's queue status.
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- **Migrating existing `live_gui` tests to the test engine.** That's Track 2 (`test_engine_docking_tests_<date>`). This track only builds the bridge + proves it works with 1 smoke test.
|
||||
- **Visual regression via screenshot capture.** That's Track 3 (`test_engine_capture_regression_<date>`). The `ctx.capture_screenshot_window` API is available but not wired in this track.
|
||||
- **Headless test execution (no GUI window).** The test engine requires a live GLFW window (the scenario thread drives the actual ImGui render loop). Headless mode is a future research item, not this track.
|
||||
- **The test engine's interactive UI panel (`show_test_engine_windows`).** Not shown by default. Can be added as a debug toggle in a follow-up.
|
||||
- **Test engine license audit.** Per the stub: "free for individuals, educational, open-source, and small businesses. Paid for larger businesses." This project is personal-use; no audit needed. Flagged for awareness only.
|
||||
- **CI wiring of the test engine.** The `live_gui` fixture already runs in CI via the batched runner. The `--enable-test-engine` flag is additive. No CI config changes needed.
|
||||
- **Touching `src/models.py` or any taxonomy files.** Zero overlap with the running `tier2/post_module_taxonomy_de_cruft_20260627` branch or the `enforcement_gap_closure_20260627` track.
|
||||
@@ -0,0 +1,64 @@
|
||||
# Track state for test_engine_integration_20260627
|
||||
# Initialized by Tier 1 Orchestrator on 2026-06-27.
|
||||
# Implementation delegated to Tier 2 (autonomous) or Tier 3 worker dispatch.
|
||||
# This is Track 1 of 3 in the Test Engine Campaign.
|
||||
|
||||
[meta]
|
||||
track_id = "test_engine_integration_20260627"
|
||||
name = "ImGui Test Engine Integration (Bridge via API Hooks)"
|
||||
status = "active"
|
||||
current_phase = 0
|
||||
last_updated = "2026-06-27"
|
||||
|
||||
[blocked_by]
|
||||
# None. Parallel-safe against tier2/post_module_taxonomy_de_cruft_20260627
|
||||
# (zero file overlap: this track touches sloppy.py, gui_2.py:641-700,
|
||||
# api_hooks.py, api_hook_client.py, conftest.py — none of which Tier 2 touches)
|
||||
# and enforcement_gap_closure_20260627 (scripts/audit_*, python.md — zero overlap).
|
||||
|
||||
[blocks]
|
||||
test_engine_docking_tests = "planned (Track 2 of 3 campaign)"
|
||||
test_engine_capture_regression = "planned (Track 3 of 3 campaign)"
|
||||
|
||||
[phases]
|
||||
phase_1 = { status = "pending", checkpointsha = "", name = "Enable the Test Engine in the App (CLI flag + runner_params + register_tests callback)" }
|
||||
phase_2 = { status = "pending", checkpointsha = "", name = "Build the API Hooks Bridge (4 endpoints + 4 client methods)" }
|
||||
phase_3 = { status = "pending", checkpointsha = "", name = "Live_gui Fixture + Full Smoke Test" }
|
||||
phase_4 = { status = "pending", checkpointsha = "", name = "End-of-Track Report + State Update + User Sign-off" }
|
||||
|
||||
[tasks]
|
||||
# Phase 1: enable the engine
|
||||
t1_1 = { status = "pending", commit_sha = "", description = "Write failing test for --enable-test-engine flag + engine activation (Red phase)" }
|
||||
t1_2 = { status = "pending", commit_sha = "", description = "Add --enable-test-engine CLI flag to sloppy.py + test_engine_enabled field to AppController" }
|
||||
t1_3 = { status = "pending", commit_sha = "", description = "Enable engine in App.run() (runner_params.use_imgui_test_engine = True + callbacks.register_tests = self._register_imgui_tests) + add _register_imgui_tests method with smoke test" }
|
||||
t1_4 = { status = "pending", commit_sha = "", description = "Manual verification: run sloppy.py --enable-test-engine locally; confirm engine activates + no GIL-transfer crash" }
|
||||
# Phase 2: build the bridge
|
||||
t2_1 = { status = "pending", commit_sha = "", description = "Write failing tests for queue_test + wait_for_test_results + get_test_results (Red phase)" }
|
||||
t2_2 = { status = "pending", commit_sha = "", description = "Add 4 /api/test_engine/* endpoints to HookServer (queue, status, results, abort)" }
|
||||
t2_3 = { status = "pending", commit_sha = "", description = "Add 4 new methods to ApiHookClient (queue_test, get_test_status, get_test_results, wait_for_test_results)" }
|
||||
t2_4 = { status = "pending", commit_sha = "", description = "Run Phase 2 tests (Green phase); verify all 3 smoke tests pass" }
|
||||
# Phase 3: live_gui fixture + full smoke test
|
||||
t3_1 = { status = "pending", commit_sha = "", description = "Update live_gui fixture (conftest.py:792) to pass --enable-test-engine" }
|
||||
t3_2 = { status = "pending", commit_sha = "", description = "Run full smoke test + regression batch (Green phase)" }
|
||||
# Phase 4: end-of-track
|
||||
t4_1 = { status = "pending", commit_sha = "", description = "Write docs/reports/TRACK_COMPLETION_test_engine_integration_20260627.md" }
|
||||
t4_2 = { status = "pending", commit_sha = "", description = "Update conductor/tracks.md + chronology.md + state.toml -> status='completed'" }
|
||||
t4_3 = { status = "pending", commit_sha = "", description = "Conductor - User Manual Verification (PAUSE for user sign-off)" }
|
||||
|
||||
[verification]
|
||||
phase_1_complete = false
|
||||
phase_2_complete = false
|
||||
phase_3_complete = false
|
||||
phase_4_complete = false
|
||||
engine_activates_without_crash = false
|
||||
smoke_test_passes = false
|
||||
no_live_gui_regression = false
|
||||
|
||||
[campaign_context]
|
||||
# This is Track 1 of 3. The campaign enables high-fidelity UI simulation via the
|
||||
# Dear ImGui Test Engine, bridged through the existing API hooks layer.
|
||||
campaign_name = "Test Engine Campaign"
|
||||
track_1 = "test_engine_integration_20260627 (THIS; bridge + smoke test)"
|
||||
track_2 = "test_engine_docking_tests (migrate docking/focus/panel tests)"
|
||||
track_3 = "test_engine_capture_regression (visual regression via screenshot capture)"
|
||||
key_risk = "R1: GIL-transfer crash if the app's thread layout doesn't work with the engine's scenario thread (mitigated by Phase 1 Task 1.4 manual checkpoint)"
|
||||
@@ -0,0 +1,84 @@
|
||||
{
|
||||
"track_id": "video_analysis_campaign_2_20260627",
|
||||
"name": "Video Analysis Campaign 2 (4 AI Videos, 3-Pass)",
|
||||
"status": "active",
|
||||
"branch": "master",
|
||||
"created": "2026-06-27",
|
||||
"owner": "Tier 1 (initialized); implementation delegated to Tier 2/3.",
|
||||
"blocked_by": [],
|
||||
"blocks": ["video_analysis_2_pass_2_deob (future)", "video_analysis_2_pass_3_projection (future)"],
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"conductor/tracks/video_analysis_2_entropy_compression_20260627/ (child; Pass 1 report + artifacts)",
|
||||
"conductor/tracks/video_analysis_2_lecun_world_models_20260627/ (child)",
|
||||
"conductor/tracks/video_analysis_2_lecun_bet_against_llms_20260627/ (child)",
|
||||
"conductor/tracks/video_analysis_2_recursive_self_improvement_20260627/ (child)",
|
||||
"conductor/tracks/video_analysis_2_synthesis_20260627/ (child; cross-video synthesis)",
|
||||
"docs/reports/TRACK_COMPLETION_video_analysis_campaign_2_20260627.md (end-of-campaign closeout)"
|
||||
],
|
||||
"modified_files": [],
|
||||
"deleted_files": []
|
||||
},
|
||||
"estimated_effort": {
|
||||
"method": "scope (per workflow.md Tier 1 Track Initialization Rules. NO day estimates.)",
|
||||
"phase_0": "3 steps: verify pipeline + scaffold child tracks + commit",
|
||||
"phase_1": "5 steps: 4 per-video Pass 1 reports + commit",
|
||||
"phase_2": "2 steps: synthesis report + commit",
|
||||
"phase_3": "3 steps: verify + user review gate + checkpoint commit"
|
||||
},
|
||||
"verification_criteria": [
|
||||
"G1: 4 Pass 1 deep-dive reports exist, each >=1,000 LOC",
|
||||
"G2: Pass 2 deobfuscation applied to all 4 (future sub-track; not part of this plan)",
|
||||
"G3: Pass 3 C11/Python projection for all 4 (future sub-track; not part of this plan)",
|
||||
"G4: Cross-video synthesis report exists, connecting the 4 reports + Campaign A insights",
|
||||
"G5: End-of-campaign closeout report exists"
|
||||
],
|
||||
"regressions_and_pre_existing_failures": [],
|
||||
"pre_existing_failures_remaining": [],
|
||||
"deferred_to_followup_tracks": [
|
||||
{
|
||||
"title": "Pass 2: Deobfuscation",
|
||||
"description": "Apply lexicon v2 to all 4 videos. May produce lexicon v3 corrections if new notation surfaces (JEPA, bootstrapping).",
|
||||
"track_status": "not yet initialized; authored after Pass 1 ships"
|
||||
},
|
||||
{
|
||||
"title": "Pass 3: C11/Python Projection",
|
||||
"description": "Project each video's deobfuscated content to C11 or Python code in the user's idiomatic style.",
|
||||
"track_status": "not yet initialized; authored after Pass 2 ships"
|
||||
},
|
||||
{
|
||||
"title": "Lexicon v3 patch (conditional)",
|
||||
"description": "Only if the 4 new videos surface notation the lexicon v2 doesn't cover.",
|
||||
"track_status": "conditional; depends on Pass 2 findings"
|
||||
}
|
||||
],
|
||||
"risk_register": [
|
||||
{
|
||||
"id": "R1",
|
||||
"description": "yt-dlp fails for one or more videos (oEmbed 401 or geo-restriction)",
|
||||
"likelihood": "low",
|
||||
"impact": "Pass 1 report for that video cannot be produced via the pipeline",
|
||||
"mitigation": "the prior campaign had 2 oEmbed failures but yt-dlp still worked; if yt-dlp fails, alternative acquisition (manual download, alternative URL) is a manual fallback"
|
||||
},
|
||||
{
|
||||
"id": "R2",
|
||||
"description": "Video transcripts are low quality (auto-generated, no punctuation)",
|
||||
"likelihood": "medium",
|
||||
"impact": "Pass 1 report quality is degraded; Pass 2 deobfuscation has less to work with",
|
||||
"mitigation": "the pipeline's OCR step supplements the transcript with keyframe text; if both are low quality, manual transcript correction is a user action"
|
||||
},
|
||||
{
|
||||
"id": "R3",
|
||||
"description": "Lexicon v2 doesn't cover new notation (JEPA, bootstrapping, world-model latent dynamics)",
|
||||
"likelihood": "medium",
|
||||
"impact": "Pass 2 deobfuscation produces gaps; a lexicon v3 patch track is needed",
|
||||
"mitigation": "the v2 patch track precedent (video_analysis_deob_lexicon_v2_20260623) shows the correction process works; a v3 patch is a known pattern"
|
||||
}
|
||||
],
|
||||
"campaign_context": {
|
||||
"campaign_name": "Video Analysis Campaign 2",
|
||||
"prior_campaign": "video_analysis_campaign_20260621 (12 videos; closed 2026-06-23)",
|
||||
"sibling_campaign": "Directive Encoding Campaign (Campaign A; directive_hotswap_harness_20260627)",
|
||||
"cross_campaign_relationship": "Intellectual cross-pollination. Video 1 (entropy/compression) is most directly relevant to directive encoding. Videos 2-3 (LeCun) inform whether directive encoding should account for non-autoregressive architectures. Video 4 (recursive self-improvement) is the meta-question the directive harness addresses."
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,155 @@
|
||||
# Plan: Video Analysis Campaign 2 (4 AI Videos, 3-Pass)
|
||||
|
||||
Track: `video_analysis_campaign_2_20260627`
|
||||
Branch: master (research track; no code changes, no test changes — pure analysis + reports)
|
||||
Spec: `conductor/tracks/video_analysis_campaign_2_20260627/spec.md`
|
||||
|
||||
This is an umbrella track. The plan covers Phase 0 (umbrella setup), Phase 1 (Pass 1 information extraction for 4 videos), Phase 2 (cross-video synthesis), and Phase 3 (end-of-Pass-1 checkpoint). Pass 2 + Pass 3 plans are authored as sub-tracks once Pass 1 ships.
|
||||
|
||||
---
|
||||
|
||||
## Phase 0: Umbrella Setup
|
||||
|
||||
Focus: Verify the pipeline works for the 4 new videos; scaffold the child track directories.
|
||||
|
||||
- [ ] **Step 0.1: Verify the video acquisition pipeline works for all 4 videos**
|
||||
|
||||
**WHAT:** Run `scripts/video_analysis/download_video.py` for each of the 4 URLs. Verify the videos download successfully via `yt-dlp`. Some videos may fail oEmbed (as the prior campaign experienced with 2 E-cluster videos); `yt-dlp` may still work.
|
||||
|
||||
**HOW:**
|
||||
```bash
|
||||
uv run python -m scripts.video_analysis.download_video "https://youtu.be/l6DKRf-fAAM" --slug entropy_compression
|
||||
uv run python -m scripts.video_analysis.download_video "https://www.youtube.com/watch?v=72Xj8k5WQX4" --slug lecun_world_models
|
||||
uv run python -m scripts.video_analysis.download_video "https://youtu.be/kYkIdXwW2AE" --slug lecun_bet_against_llms
|
||||
uv run python -m scripts.video_analysis.download_video "https://youtu.be/t7_ZXgfJVG8" --slug recursive_self_improvement
|
||||
```
|
||||
|
||||
**VERIFY:** 4 video files downloaded. If any fail, document the failure + alternative acquisition method.
|
||||
|
||||
- [ ] **Step 0.2: Scaffold the 4 child track directories**
|
||||
|
||||
**WHERE:** `conductor/tracks/video_analysis_2_<slug>_20260627/` (4 directories)
|
||||
|
||||
**WHAT:** Create the directories with placeholder spec.md + state.toml files. Each child track is a Pass 1 report producer.
|
||||
|
||||
- [ ] **Step 0.3: Commit the umbrella setup**
|
||||
|
||||
```bash
|
||||
git add conductor/tracks/video_analysis_campaign_2_20260627/ conductor/tracks/video_analysis_2_*/
|
||||
git commit -m "conductor(track): scaffold video_analysis_campaign_2_20260627 (umbrella + 4 children)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Pass 1 — Information Extraction (4 Videos)
|
||||
|
||||
Focus: Produce 4 deep-dive reports using the existing pipeline. Each video is a child track executed independently.
|
||||
|
||||
- [ ] **Step 1.1: Video 1 — entropy_compression (Reinventing Entropy | Compression is Intelligence Part 1)**
|
||||
|
||||
**URL:** https://youtu.be/l6DKRf-fAAM
|
||||
**Slug:** `entropy_compression`
|
||||
**Cluster:** A (compression/entropy)
|
||||
**Child track:** `conductor/tracks/video_analysis_2_entropy_compression_20260627/`
|
||||
|
||||
**Steps:**
|
||||
1. Download video (if not already done in Phase 0).
|
||||
2. Extract transcript via `scripts/video_analysis/extract_transcript.py`.
|
||||
3. Extract keyframes via `scripts/video_analysis/extract_keyframes.py`.
|
||||
4. OCR keyframes via `scripts/video_analysis/ocr_frames.py`.
|
||||
5. Synthesize report via `scripts/video_analysis/synthesize_report.py`.
|
||||
6. Write `report.md` (1,000-10,000 LOC) — lossless preservation of the video's content.
|
||||
|
||||
**Expected content:** Shannon entropy, Kolmogorov complexity, compression as intelligence, the relationship between compression and prediction. This video is the most directly relevant to Campaign A (directive encoding = compression of instructions).
|
||||
|
||||
- [ ] **Step 1.2: Video 2 — lecun_world_models (Yann LeCun: World Models)**
|
||||
|
||||
**URL:** https://www.youtube.com/watch?v=72Xj8k5WQX4
|
||||
**Slug:** `lecun_world_models`
|
||||
**Cluster:** B (world models)
|
||||
**Child track:** `conductor/tracks/video_analysis_2_lecun_world_models_20260627/`
|
||||
|
||||
**Steps:** Same pipeline as Step 1.1.
|
||||
|
||||
**Expected content:** LeCun's world model architecture, JEPA (Joint Embedding Predictive Architecture), planning via latent dynamics, the distinction between generative models and predictive models. Relevant to how LLMs model directive intent.
|
||||
|
||||
- [ ] **Step 1.3: Video 3 — lecun_bet_against_llms (LeCun's $1B Bet Against LLMs [Part 1])**
|
||||
|
||||
**URL:** https://youtu.be/kYkIdXwW2AE
|
||||
**Slug:** `lecun_bet_against_llms`
|
||||
**Cluster:** B (world models)
|
||||
**Child track:** `conductor/tracks/video_analysis_2_lecun_bet_against_llms_20260627/`
|
||||
|
||||
**Steps:** Same pipeline.
|
||||
|
||||
**Expected content:** LeCun's critique of LLMs, autoregressive limitations, the path toward reasoning systems, world models as the next AI revolution. Relevant to whether directive encoding is about pattern-matching (LLM) or reasoning (world model).
|
||||
|
||||
- [ ] **Step 1.4: Video 4 — recursive_self_improvement (Recursive Self-Improvement)**
|
||||
|
||||
**URL:** https://youtu.be/t7_ZXgfJVG8
|
||||
**Slug:** `recursive_self_improvement`
|
||||
**Cluster:** C (meta-AI)
|
||||
**Child track:** `conductor/tracks/video_analysis_2_recursive_self_improvement_20260627/`
|
||||
|
||||
**Steps:** Same pipeline.
|
||||
|
||||
**Expected content:** Recursive self-improvement, alignment, bootstrapping intelligence. The meta-question: can better directive encodings be discovered iteratively? The directive hot-swap harness IS a recursive self-improvement tool for directive encoding.
|
||||
|
||||
- [ ] **Step 1.5: Commit Pass 1 reports**
|
||||
|
||||
```bash
|
||||
git add conductor/tracks/video_analysis_2_*/report.md
|
||||
git commit -m "feat(video_analysis): Pass 1 complete — 4 deep-dive reports (entropy_compression, lecun_world_models, lecun_bet_against_llms, recursive_self_improvement)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Cross-Video Synthesis (Pass 1)
|
||||
|
||||
Focus: Connect the 4 reports to each other and to the prior campaign's themes.
|
||||
|
||||
- [ ] **Step 2.1: Write the synthesis report**
|
||||
|
||||
**WHERE:** `conductor/tracks/video_analysis_2_synthesis_20260627/report.md`
|
||||
|
||||
**WHAT:**
|
||||
- Theme matrix: which videos touch which themes (compression, world models, self-improvement, directive encoding).
|
||||
- Concept map: how the 4 videos' concepts relate.
|
||||
- Connection to the prior campaign: which of the 12 prior videos share themes with these 4 new ones (especially `entropy_epiplexity` for video 1, `cs229_building_llms` for videos 2-3).
|
||||
- Cross-campaign insights: what the video analysis suggests for Campaign A (directive encoding). Specifically: does the entropy/compression video suggest a principled way to measure directive encoding efficiency? Do LeCun's world-model ideas suggest directive encoding should account for non-autoregressive architectures?
|
||||
|
||||
- [ ] **Step 2.2: Commit the synthesis**
|
||||
|
||||
```bash
|
||||
git add conductor/tracks/video_analysis_2_synthesis_20260627/
|
||||
git commit -m "feat(video_analysis): Pass 1 synthesis — 4-video cross-reference + Campaign A insights"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: End-of-Pass-1 Checkpoint
|
||||
|
||||
Focus: Verify Pass 1 is complete; gate Pass 2 (deobfuscation).
|
||||
|
||||
- [ ] **Step 3.1: Verify all 4 reports exist + meet the LOC threshold**
|
||||
|
||||
```bash
|
||||
for f in conductor/tracks/video_analysis_2_*/report.md; do
|
||||
wc -l "$f"
|
||||
done
|
||||
```
|
||||
|
||||
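Expected: the 4 per-video reports, each ≥1,000 LOC (the glob also matches the Phase 2 synthesis report, which is not subject to this threshold).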
|
||||
|
||||
- [ ] **Step 3.2: Present Pass 1 results to the user**
|
||||
|
||||
Report: 4 reports produced, synthesis produced, key themes identified. PAUSE for user review before Pass 2 begins.
|
||||
|
||||
**Pass 2 (deobfuscation) and Pass 3 (C11/Python projection) plans are authored as sub-tracks once Pass 1 is approved by the user.** The user may need to gather deobfuscation samples (same as the prior campaign's warmup) before Pass 2 starts.
|
||||
|
||||
- [ ] **Step 3.3: Commit the checkpoint**
|
||||
|
||||
```bash
|
||||
git add conductor/tracks/video_analysis_campaign_2_20260627/state.toml
|
||||
git commit -m "conductor(checkpoint): video_analysis_campaign_2 Pass 1 complete — awaiting user review before Pass 2"
|
||||
```
|
||||
@@ -0,0 +1,142 @@
|
||||
# Track Specification: Video Analysis Campaign 2 (4 AI Videos, 3-Pass)
|
||||
|
||||
## Overview
|
||||
|
||||
A research campaign analyzing 4 new AI-related YouTube videos using the established 3-pass architecture from the previous 12-video campaign (Pass 1: extract → Pass 2: deobfuscate → Pass 3: project to C11/Python). The campaign reuses the existing lexicon v2 + C11 reference from the prior campaign.
|
||||
|
||||
The 4 videos share a theme — compression, entropy, world models, and recursive self-improvement — that is directly relevant to the directive-encoding research in Campaign A (the directive hot-swap harness). The two campaigns are siblings: intellectual cross-pollination, no hard dependency, can run in parallel.
|
||||
|
||||
**This spec covers the umbrella track.** The per-video child tracks (Pass 1 reports) and the deobfuscation sub-tracks (Pass 2 + Pass 3) are initialized as children once the umbrella is approved.
|
||||
|
||||
## The 4 Videos
|
||||
|
||||
| # | Title | URL | Cluster | Topic |
|
||||
|---|---|---|---|---|
|
||||
| 1 | Reinventing Entropy \| Compression is Intelligence Part 1 | https://youtu.be/l6DKRf-fAAM | A (compression/entropy) | Shannon entropy, compression as intelligence, Kolmogorov complexity |
|
||||
| 2 | Yann LeCun: World Models: Enabling the next AI revolution | https://www.youtube.com/watch?v=72Xj8k5WQX4 | B (world models) | LeCun's world model architecture; JEPA; planning via latent dynamics |
|
||||
| 3 | Yann LeCun's $1B Bet Against LLMs [Part 1] | https://youtu.be/kYkIdXwW2AE | B (world models) | LeCun's critique of LLMs; autoregressive limitations; path toward reasoning |
|
||||
| 4 | Recursive Self-Improvement | https://youtu.be/t7_ZXgfJVG8 | C (meta-AI) | Recursive self-improvement; alignment; bootstrapping intelligence |
|
||||
|
||||
**Cluster assignment:**
|
||||
- **A (compression/entropy):** video 1 — directly relevant to the directive-encoding question (how do you compress information for an LLM?)
|
||||
- **B (world models):** videos 2-3 — LeCun's world-model work informs how LLMs model directive intent and whether alternative architectures change the encoding question
|
||||
- **C (meta-AI):** video 4 — recursive self-improvement is the meta-question of whether better directive encodings can be discovered iteratively
|
||||
|
||||
## Current State Audit (as of master `03c7cfd5`)
|
||||
|
||||
### Already Implemented (from the prior campaign — DO NOT re-implement)
|
||||
|
||||
- **`scripts/video_analysis/` pipeline** (7 modules): `download_video.py`, `extract_transcript.py`, `extract_keyframes.py`, `ocr_frames.py`, `synthesize_report.py`, `error_types.py`, `__init__.py`. These are the reusable tooling from the prior campaign. Pass 1 reuses them directly.
|
||||
|
||||
- **Lexicon v2** (from `video_analysis_deob_lexicon_v2_20260623`): the codified deobfuscation spec with 76 terms, the 5 load-bearing rules (Boundedness, Form-anchor, Etymology, Lossless, Encoding-explicit), the constructive type-theoretic foundation, and the per-language `<<` / `>>` rendering. Pass 2 starts from v2; may produce v3 corrections if the new videos surface notation the lexicon doesn't cover.
|
||||
|
||||
- **C11 reference** (from `video_analysis_deob_c11_reference_20260623`): the user's idiomatic C11 style (byte-width types, underscore-suffixed modifiers, hand-rolled DSL, memory ordering vocabulary, slice + arena, design-doc headers). Pass 3 uses this as the projection target.
|
||||
|
||||
- **Pass 3 projection pattern** (from `video_analysis_deob_c11_reference_20260623` + `pass_3_c11_python_projection_20260623`): per-video deliverables = C11 (.c + .h) or Python (.py) + 3-4 markdown docs (translation, decoder, notes). 4 + 3 verification criteria per the v2 lexicon.
|
||||
|
||||
- **The 3-pass architecture** (documented in `docs/reports/2026-06-15/CAMPAIGN_CLOSE_OUT_video_analysis_20260621.md`): Pass 1 captures raw content losslessly; Pass 2 applies the lexicon; Pass 3 projects to code. The v2 patch + C11 reference are sub-tracks between Pass 2 and Pass 3.
|
||||
|
||||
### Gaps to Fill (This Campaign's Scope)
|
||||
|
||||
- **GAP-1: No Pass 1 reports for the 4 new videos.** The prior campaign analyzed 12 videos; these 4 are new. Pass 1 produces 4 deep-dive reports (one per video) using the existing pipeline.
|
||||
|
||||
- **GAP-2: No Pass 2 deobfuscation for the 4 new videos.** The lexicon v2 must be applied to each video's content. May produce lexicon v3 corrections if the new videos surface notation the lexicon doesn't cover (e.g., LeCun's JEPA terminology, recursive self-improvement's bootstrapping notation).
|
||||
|
||||
- **GAP-3: No Pass 3 C11/Python projection for the 4 new videos.** Each video's deobfuscated content must be projected to C11 or Python code in the user's idiomatic style.
|
||||
|
||||
- **GAP-4: No cross-video synthesis.** The prior campaign had a synthesis track (`video_analysis_synthesis_20260621`) that cross-referenced the 12 reports. This campaign should produce a synthesis cross-referencing the 4 new reports + connecting to the prior campaign's themes.
|
||||
|
||||
### Relationship to Campaign A (Directive Hot-Swap Harness)
|
||||
|
||||
The two campaigns share a theme ("how do you encode information densely for an LLM?") but are tracked and executed independently:
|
||||
|
||||
- **Video 1 (entropy/compression)** provides theoretical grounding for information density. The directive-encoding question IS a compression question: what is the minimal token-cost encoding of a directive that maintains LLM compliance?
|
||||
- **Videos 2-3 (LeCun world models)** inform how LLMs model directive intent. If LLMs are autoregressive pattern-matchers (LeCun's critique), then directive encoding is about pattern-matching, not reasoning. If world models are the path forward, directive encoding may need to account for non-autoregressive architectures.
|
||||
- **Video 4 (recursive self-improvement)** is the meta-question: can better directive encodings be discovered iteratively? The directive hot-swap harness IS a recursive self-improvement tool for directive encoding.
|
||||
|
||||
Insights from the video analysis may surface alternative encoding strategies to test in Campaign A's harness. The harness's design (preset as bill-of-materials, variant as alternative encoding) mirrors the video campaign's deobfuscation pass (same content, different encoding).
|
||||
|
||||
## Goals
|
||||
|
||||
- **G1.** 4 Pass 1 deep-dive reports (one per video, 1,000-10,000 LOC each) produced via the existing `scripts/video_analysis/` pipeline.
|
||||
- **G2.** Pass 2 deobfuscation applied to all 4 videos using lexicon v2. Lexicon v3 corrections produced if the new videos surface notation the lexicon doesn't cover.
|
||||
- **G3.** Pass 3 C11/Python projection for all 4 videos (per-video deliverables: C11 .c + .h or Python .py + 3-4 markdown docs).
|
||||
- **G4.** A cross-video synthesis report connecting the 4 new reports to each other and to the prior campaign's themes.
|
||||
- **G5.** End-of-campaign closeout report documenting what was done, key insights, and any cross-campaign insights relevant to Campaign A.
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
### FR1: Pass 1 — Information Extraction
|
||||
|
||||
- Use `scripts/video_analysis/download_video.py` to acquire each video via `yt-dlp`.
|
||||
- Use `scripts/video_analysis/extract_transcript.py` to extract the transcript.
|
||||
- Use `scripts/video_analysis/extract_keyframes.py` + `scripts/video_analysis/ocr_frames.py` to extract keyframe images + OCR text.
|
||||
- Use `scripts/video_analysis/synthesize_report.py` to synthesize the deep-dive report.
|
||||
- Each report preserves the source content losslessly (no deobfuscation yet — that's Pass 2).
|
||||
- Per-video deliverable: `report.md` (1,000-10,000 LOC) + supporting artifacts (transcript, keyframes, OCR).
|
||||
- **Video slug naming:** `entropy_compression` (video 1), `lecun_world_models` (video 2), `lecun_bet_against_llms` (video 3), `recursive_self_improvement` (video 4).
|
||||
|
||||
### FR2: Pass 2 — Deobfuscation
|
||||
|
||||
- Apply lexicon v2 to each video's Pass 1 report.
|
||||
- Per-video deliverables: translation (3-column: original → deobfuscated → rationale) + replacement (the deobfuscated content) + decoder (the notation mapping).
|
||||
- 4 + 4 verification criteria per the v2 lexicon (lossless, bounded, constructively typed, etymology-cited + the 4 additional from the apply phase).
|
||||
- If a video surfaces notation the lexicon doesn't cover: produce lexicon v3 corrections (L-codes) + update `terms_catalog.md`.
|
||||
- **Expected new notation:** LeCun's JEPA (Joint Embedding Predictive Architecture), the world-model latent dynamics vocabulary, recursive self-improvement's bootstrapping notation.
|
||||
|
||||
### FR3: Pass 3 — C11/Python Projection
|
||||
|
||||
- Project each video's deobfuscated content to C11 (.c + .h) or Python (.py) in the user's idiomatic style.
|
||||
- Use the C11 reference (`video_analysis_deob_c11_reference_20260623`) as the style guide.
|
||||
- Per-video deliverables: C11 or Python code + 3-4 markdown docs (translation, decoder, notes).
|
||||
- Per-language `<<` / `>>` rendering (much_less / much_greater / weakly_coupled with tolerance).
|
||||
- Encoding placeholder scheme (float / integer / Scalar / float64).
|
||||
- Code may or may not run (per user 2026-06-23: "code may or may not run").
|
||||
|
||||
### FR4: Cross-Video Synthesis
|
||||
|
||||
- A synthesis report connecting the 4 new reports to each other.
|
||||
- Theme matrix: which videos touch which themes (compression, world models, self-improvement, directive encoding).
|
||||
- Concept map: how the 4 videos' concepts relate.
|
||||
- Connection to the prior campaign: which of the 12 prior videos share themes with these 4 new ones.
|
||||
- Cross-campaign insights: any insights relevant to Campaign A (directive encoding).
|
||||
|
||||
### FR5: End-of-Campaign Closeout
|
||||
|
||||
- A closeout report following the precedent of `docs/reports/2026-06-15/CAMPAIGN_CLOSE_OUT_video_analysis_20260621.md`.
|
||||
- Documents: what was done, key decisions, final statistics, open questions.
|
||||
- Cross-campaign insights: what the video analysis suggests for directive encoding (Campaign A).
|
||||
|
||||
## Non-Functional Requirements
|
||||
|
||||
- **Lossless preservation:** Pass 1 artifacts must NOT be over-summarized (data cascades to Pass 2/3). Per the prior campaign's "load-bearing directive."
|
||||
- **Lexicon v2 as starting point:** Pass 2 starts from v2. If v3 corrections are needed, they are produced as a patch track (same pattern as `video_analysis_deob_lexicon_v2_20260623`).
|
||||
- **User-led gating:** Pass 2 may require the user to gather deobfuscation samples (same as the prior campaign's warmup). Pass 3 may require the user to articulate "own caveats" before the projection starts. These are user-action gates, not agent-action gates.
|
||||
- **Reusable tooling:** the existing `scripts/video_analysis/` pipeline is reused without modification. If the pipeline needs changes (e.g., new OCR engine, new transcript API), that's a separate tooling track.
|
||||
|
||||
## Architecture Reference
|
||||
|
||||
- **`docs/reports/2026-06-15/CAMPAIGN_CLOSE_OUT_video_analysis_20260621.md`** — the prior campaign's closeout (the pattern this campaign follows).
|
||||
- **`scripts/video_analysis/`** — the existing pipeline (7 modules; reused for Pass 1).
|
||||
- **The lexicon v2** (from `video_analysis_deob_lexicon_v2_20260623`) — the deobfuscation substrate for Pass 2.
|
||||
- **The C11 reference** (from `video_analysis_deob_c11_reference_20260623`) — the projection target for Pass 3.
|
||||
- **`docs/superpowers/specs/2026-06-27-directive-hotswap-harness-design.md`** → now at `conductor/tracks/directive_hotswap_harness_20260627/spec.md` — the sibling campaign (Campaign A).
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- **Modifying the existing `scripts/video_analysis/` pipeline.** If the pipeline needs changes, that's a separate tooling track.
|
||||
- **Re-analyzing the 12 prior videos.** The prior campaign is closed.
|
||||
- **Building the directive hot-swap harness.** That's Campaign A (separate track, separate spec).
|
||||
- **Authoring alternative directive encodings (v2+).** That's a future track in Campaign A.
|
||||
- **Automated compliance testing of directive encodings.** Future track.
|
||||
|
||||
## Track Structure (Children)
|
||||
|
||||
This is the umbrella track. Children are initialized once the umbrella is approved:
|
||||
|
||||
- **Pass 1 children (4):** `video_analysis_2_entropy_compression_20260627`, `video_analysis_2_lecun_world_models_20260627`, `video_analysis_2_lecun_bet_against_llms_20260627`, `video_analysis_2_recursive_self_improvement_20260627`
|
||||
- **Pass 1 synthesis (1):** `video_analysis_2_synthesis_20260627`
|
||||
- **Pass 2 sub-tracks (TBD):** umbrella + warmup (if needed) + apply. Initialized after Pass 1 ships.
|
||||
- **Pass 3 sub-tracks (TBD):** initialized after Pass 2 ships.
|
||||
- **Lexicon v3 patch (conditional):** only if the new videos surface notation the lexicon doesn't cover.
|
||||
- **End-of-campaign closeout (1):** `video_analysis_campaign_2_closeout_20260627`
|
||||
@@ -0,0 +1,58 @@
|
||||
# Track state for video_analysis_campaign_2_20260627
|
||||
# Initialized by Tier 1 Orchestrator on 2026-06-27.
|
||||
# Umbrella track for the 4-video research campaign (Pass 1 only; Pass 2/3 are sub-tracks).
|
||||
|
||||
[meta]
|
||||
track_id = "video_analysis_campaign_2_20260627"
|
||||
name = "Video Analysis Campaign 2 (4 AI Videos, 3-Pass)"
|
||||
status = "active"
|
||||
current_phase = 0
|
||||
last_updated = "2026-06-27"
|
||||
|
||||
[blocked_by]
|
||||
# None. Research track; no code changes, no test changes.
|
||||
|
||||
[blocks]
|
||||
video_analysis_2_pass_2_deob = "planned (future; authored after Pass 1 ships)"
|
||||
video_analysis_2_pass_3_projection = "planned (future; authored after Pass 2 ships)"
|
||||
|
||||
[phases]
|
||||
phase_0 = { status = "pending", checkpointsha = "", name = "Umbrella Setup (verify pipeline + scaffold child tracks)" }
|
||||
phase_1 = { status = "pending", checkpointsha = "", name = "Pass 1 — Information Extraction (4 per-video reports)" }
|
||||
phase_2 = { status = "pending", checkpointsha = "", name = "Cross-Video Synthesis (Pass 1)" }
|
||||
phase_3 = { status = "pending", checkpointsha = "", name = "End-of-Pass-1 Checkpoint (verify + user review gate)" }
|
||||
|
||||
[tasks]
|
||||
t0_1 = { status = "pending", commit_sha = "", description = "Verify yt-dlp pipeline works for all 4 URLs" }
|
||||
t0_2 = { status = "pending", commit_sha = "", description = "Scaffold 4 child track directories + synthesis child" }
|
||||
t0_3 = { status = "pending", commit_sha = "", description = "Commit umbrella setup" }
|
||||
t1_1 = { status = "pending", commit_sha = "", description = "Video 1: entropy_compression (Reinventing Entropy | Compression is Intelligence Part 1)" }
|
||||
t1_2 = { status = "pending", commit_sha = "", description = "Video 2: lecun_world_models (Yann LeCun: World Models)" }
|
||||
t1_3 = { status = "pending", commit_sha = "", description = "Video 3: lecun_bet_against_llms (LeCun's $1B Bet Against LLMs [Part 1])" }
|
||||
t1_4 = { status = "pending", commit_sha = "", description = "Video 4: recursive_self_improvement (Recursive Self-Improvement)" }
|
||||
t1_5 = { status = "pending", commit_sha = "", description = "Commit Pass 1 reports" }
|
||||
t2_1 = { status = "pending", commit_sha = "", description = "Write cross-video synthesis report (theme matrix + concept map + Campaign A insights)" }
|
||||
t2_2 = { status = "pending", commit_sha = "", description = "Commit synthesis" }
|
||||
t3_1 = { status = "pending", commit_sha = "", description = "Verify all 4 reports >= 1,000 LOC" }
|
||||
t3_2 = { status = "pending", commit_sha = "", description = "Present Pass 1 results to user (PAUSE for review before Pass 2)" }
|
||||
t3_3 = { status = "pending", commit_sha = "", description = "Commit checkpoint" }
|
||||
|
||||
[verification]
|
||||
phase_0_complete = false
|
||||
phase_1_complete = false
|
||||
phase_2_complete = false
|
||||
phase_3_complete = false
|
||||
pass_1_reports_count = 0
|
||||
synthesis_complete = false
|
||||
|
||||
[campaign_context]
|
||||
campaign_name = "Video Analysis Campaign 2"
|
||||
prior_campaign = "video_analysis_campaign_20260621 (12 videos; closed 2026-06-23)"
|
||||
sibling_campaign = "Directive Encoding Campaign (Campaign A; directive_hotswap_harness_20260627)"
|
||||
cross_campaign_relationship = "Intellectual cross-pollination. Video 1 (entropy/compression) is most relevant to directive encoding."
|
||||
videos = [
|
||||
{ slug = "entropy_compression", url = "https://youtu.be/l6DKRf-fAAM", cluster = "A" },
|
||||
{ slug = "lecun_world_models", url = "https://www.youtube.com/watch?v=72Xj8k5WQX4", cluster = "B" },
|
||||
{ slug = "lecun_bet_against_llms", url = "https://youtu.be/kYkIdXwW2AE", cluster = "B" },
|
||||
{ slug = "recursive_self_improvement", url = "https://youtu.be/t7_ZXgfJVG8", cluster = "C" },
|
||||
]
|
||||
@@ -383,11 +383,13 @@ The Tier 2 autonomous mode is the unattended execution mode for tracks. See `doc
|
||||
### Conventions (MUST follow)
|
||||
|
||||
1. **Test runner:** Tier 2 always uses `uv run python scripts/run_tests_batched.py`. NEVER `uv run pytest` directly. The batched runner provides tier-based filtering, parallelization (xdist), and a summary table that direct pytest does not.
|
||||
2. **Default branch:** this repo uses `master` (not `main`). When fetching or branching, use `origin/master`. Do not assume `main` exists.
|
||||
3. **Line endings:** preserve existing line endings on edit. This repo has a mix of CRLF and LF; repo-wide LF standardization is a future track. For now, do not normalize.
|
||||
4. **Throw-away scripts:** Tier 2 writes its working scripts to `scripts/tier2/artifacts/<track-name>/`, NOT the base `scripts/tier2/` directory. The base is reserved for production code (failcount.py, run_track.py, write_report.py, the .ps1 launchers). Throw-away scripts are kept for archival but isolated.
|
||||
5. **End-of-track report:** at the end of every track, Tier 2 writes `docs/reports/TRACK_COMPLETION_<track-name>.md` (follow the precedent set by `TRACK_COMPLETION_tier2_autonomous_sandbox_20260616.md`) and updates `conductor/tracks/<track-name>/state.toml` to `status = "completed"`. The user reads this report to decide merge.
|
||||
6. **Run-time expectation:** tracks are 1-4 hours. If the model reports it is running out of context, Tier 2 notes progress to disk (the failcount state file) and continues. The user expects autonomous runs to complete without manual "press continue" intervention. The `--resume` flag picks up from the last completed task.
|
||||
2. **NEVER filter test output** (added 2026-06-27 per user directive). Do NOT pipe test output through `Select-Object`, `| Select -First N`, `| Select -Last N`, `head`, `tail`, or any other truncation filter. Truncated output cannot be recovered: if you later need a part that was cut off, you have to re-run the entire test run, which wastes time and context. Instead, ALWAYS redirect to a log file: `uv run python scripts/run_tests_batched.py > tests/artifacts/tier2_state/<track>/test_run_<phase>_<task>.log 2>&1`. Then read the log file with `manual-slop_read_file` or `grep` to find the relevant sections. The log file is your full record; you can search it without re-running.
|
||||
3. **Prefer targeted tier runs** (added 2026-06-27 per user directive). Do NOT run the full 11-tier batch for every verification. Run only the tiers relevant to the current task (e.g., `--tier tier3` or `--filter test_<specific_file>`). The full batch is for the USER to run after merge review, not for Tier 2's per-task verification. Running the full batch every time wastes 20+ minutes, and its output is too large to be useful in context.
|
||||
4. **Default branch:** this repo uses `master` (not `main`). When fetching or branching, use `origin/master`. Do not assume `main` exists.
|
||||
5. **Line endings:** preserve existing line endings on edit. This repo has a mix of CRLF and LF; repo-wide LF standardization is a future track. For now, do not normalize.
|
||||
6. **Throw-away scripts:** Tier 2 writes its working scripts to `scripts/tier2/artifacts/<track-name>/`, NOT the base `scripts/tier2/` directory. The base is reserved for production code (failcount.py, run_track.py, write_report.py, the .ps1 launchers). Throw-away scripts are kept for archival but isolated.
|
||||
7. **End-of-track report:** at the end of every track, Tier 2 writes `docs/reports/TRACK_COMPLETION_<track-name>.md` (follow the precedent set by `TRACK_COMPLETION_tier2_autonomous_sandbox_20260616.md`) and updates `conductor/tracks/<track-name>/state.toml` to `status = "completed"`. The user reads this report to decide merge.
|
||||
8. **Run-time expectation:** tracks are 1-4 hours. If the model reports it is running out of context, Tier 2 notes progress to disk (the failcount state file) and continues. The user expects autonomous runs to complete without manual "press continue" intervention. The `--resume` flag picks up from the last completed task.
|
||||
|
||||
### Hard bans (3-layer enforcement)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user