"""Audit for tier-2 sandbox-only files leaking into the main repo. Defense-in-depth layer 3 (after the pre-commit hook at the commit boundary): scans the working tree for files matching the forbidden patterns in conductor/tier2/githooks/forbidden-files.txt. If any match, the file is reported as a leak. Usage: uv run python scripts/audit_tier2_leaks.py # informational uv run python scripts/audit_tier2_leaks.py --strict # CI gate (exit 1) uv run python scripts/audit_tier2_leaks.py --json # machine-readable Behavior: - Walks the working tree, skipping .git/, node_modules/, and __pycache__/ (anything git would ignore at the build level) - For each candidate file, checks if its relative path contains any forbidden pattern as a substring - Reports each leak with its path and status (untracked/modified) - Default mode exits 0; --strict mode exits 1 if any leaks This script is the manual/CI guard. The pre-commit hook at conductor/tier2/githooks/pre-commit is the live guard; both layers must be present for the defense-in-depth contract to hold. """ import argparse import json import subprocess import sys from pathlib import Path CONFIG_REL = Path("conductor/tier2/githooks/forbidden-files.txt") SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv"} # Test infrastructure and the canonical source directory for tier-2 # files. Tests/ and conductor/tier2/ are project-controlled, not # tier-2-sandbox-controlled, so the audit ignores them. SKIP_TOP_DIRS = {"tests", "conductor"} def load_patterns(config_path: Path) -> list[str]: """Load substring patterns from the denylist config. Lines starting with '#' and blank lines are skipped. CR is stripped (Windows line endings). Each remaining line is a substring to look for in file paths. """ if not config_path.exists(): return [] patterns = [] for raw in config_path.read_text(encoding="utf-8").splitlines(): line = raw.rstrip("\r") stripped = line.strip() if not stripped or stripped.startswith("#"): continue patterns.append(stripped) return patterns def collect_leaks(repo_root: Path, patterns: list[str]) -> list[dict]: """Walk the working tree and return files matching any forbidden pattern. Each entry: {"path": str (relative), "status": "untracked"|"modified"}. "modified" = in HEAD but modified in working tree (leak drift in progress). "untracked" = not in HEAD (a leak staged via git add but not committed yet, OR a leak as a new untracked file). Tracked-but-clean files are NOT reported. The main repo's opencode.json, mcp_paths.toml, and other tracked forbidden patterns are legitimate; they are not leaks. Only files that have been MODIFIED locally (or are NEW) indicate sandbox drift. """ if not patterns: return [] # Get the set of modified-status from git. This avoids walking # node_modules and other ignored directories ourselves. try: modified_proc = subprocess.run( ["git", "diff", "--name-only", "-z", "--no-renames"], cwd=str(repo_root), capture_output=True, check=True, ) modified = { p.decode("utf-8") if isinstance(p, bytes) else p for p in modified_proc.stdout.split(b"\0") if p } except subprocess.CalledProcessError: modified = set() # Get tracked files for the untracked check (a path is untracked iff # not in `git ls-files`). try: tracked_proc = subprocess.run( ["git", "ls-files", "-z"], cwd=str(repo_root), capture_output=True, check=True, ) tracked = { p.decode("utf-8") if isinstance(p, bytes) else p for p in tracked_proc.stdout.split(b"\0") if p } except subprocess.CalledProcessError: tracked = set() leaks: list[dict] = [] # Scan modified files (tracked but changed in working tree) for rel_path in sorted(modified): if any(pat in rel_path for pat in patterns): leaks.append({"path": rel_path, "status": "modified"}) # Walk the working tree to catch untracked leaks. We do this manually # (rather than git ls-files --others --exclude-standard) to keep the # SKIP_DIRS rules visible in this script. for path in repo_root.rglob("*"): if not path.is_file(): continue rel = path.relative_to(repo_root).as_posix() # Skip top-level project directories (tests, conductor) plus the # standard ignored dirs. parts = path.relative_to(repo_root).parts if parts[0] in SKIP_TOP_DIRS: continue if any(part in SKIP_DIRS for part in parts): continue # Skip the pre-commit hook's temp file if rel.startswith(".tier2_leaked_"): continue if rel in tracked: continue # already handled above if any(pat in rel for pat in patterns): leaks.append({"path": rel, "status": "untracked"}) # De-duplicate (in case a path appears in multiple sources) seen: set[str] = set() unique: list[dict] = [] for leak in leaks: if leak["path"] not in seen: seen.add(leak["path"]) unique.append(leak) return unique def render_human(leaks: list[dict]) -> str: """Format the leak report for terminal output.""" if not leaks: return "[OK] No tier-2 sandbox-only files detected in the working tree.\n" out = [f"[LEAK] Found {len(leaks)} tier-2 sandbox-only file(s):", ""] for leak in leaks: out.append(f" {leak['status']:9s} {leak['path']}") out.append("") out.append("These files belong in the main repo only; they are modified by") out.append("scripts/tier2/setup_tier2_clone.ps1 in the tier-2 clone.") out.append("If committed, they would absorb the sandbox's local config drift.") out.append("To remove from the working tree: git rm --cached ") return "\n".join(out) + "\n" def render_json(leaks: list[dict]) -> str: """Format the leak report as JSON for machine consumption.""" return json.dumps( { "files": leaks, "summary": { "total": len(leaks), "untracked": sum(1 for l in leaks if l["status"] == "untracked"), "modified": sum(1 for l in leaks if l["status"] == "modified"), }, }, indent=2, ) def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description=__doc__.split("\n")[0]) parser.add_argument( "--strict", action="store_true", help="Exit 1 if any leak is detected. Default: exit 0 (informational).", ) parser.add_argument( "--json", action="store_true", help="Emit machine-readable JSON instead of the human-readable report.", ) args = parser.parse_args(argv) repo_root = Path.cwd() config_path = repo_root / CONFIG_REL patterns = load_patterns(config_path) if not patterns: print( f"warning: no forbidden patterns loaded from {config_path}; audit is a no-op.", file=sys.stderr, ) leaks: list[dict] = [] else: leaks = collect_leaks(repo_root, patterns) if args.json: print(render_json(leaks)) else: print(render_human(leaks), end="") return 1 if (args.strict and leaks) else 0 if __name__ == "__main__": sys.exit(main())