# manual_slop/scripts/audit_tier2_leaks.py

"""Audit for tier-2 sandbox-only files leaking into the main repo.

Defense-in-depth layer 3 (after the pre-commit hook at the commit
boundary): scans the working tree for files matching the forbidden
patterns in conductor/tier2/githooks/forbidden-files.txt. If any
match, the file is reported as a leak.

Usage:
 uv run python scripts/audit_tier2_leaks.py             # informational
 uv run python scripts/audit_tier2_leaks.py --strict    # CI gate (exit 1)
 uv run python scripts/audit_tier2_leaks.py --json       # machine-readable

Behavior:
- Walks the working tree, skipping .git/, node_modules/, __pycache__/,
  .venv/ and venv/ at any depth, plus the top-level tests/ and
  conductor/ directories
- For each candidate file, checks if its relative path contains
  any forbidden pattern as a substring
- Reports each leak with its path and status (untracked/modified)
- Default mode exits 0; --strict mode exits 1 if any leaks

This script is the manual/CI guard. The pre-commit hook at
conductor/tier2/githooks/pre-commit is the live guard; both layers
must be present for the defense-in-depth contract to hold.
"""
import argparse
import json
import subprocess
import sys
from pathlib import Path

# Denylist of forbidden path substrings, located relative to the repo root.
CONFIG_REL = Path("conductor") / "tier2" / "githooks" / "forbidden-files.txt"

# Directory names excluded from the audit wherever they occur in a path.
SKIP_DIRS = {
 ".git",
 ".venv",
 "__pycache__",
 "node_modules",
 "venv",
}

# Top-level directories the audit ignores: tests/ holds test
# infrastructure and conductor/ holds the canonical tier-2 sources.
# Both are project-controlled rather than tier-2-sandbox-controlled.
SKIP_TOP_DIRS = {"conductor", "tests"}


def load_patterns(config_path: Path) -> list[str]:
 """Load substring patterns from the denylist config.

 Blank lines and lines whose first non-blank character is '#' are
 skipped. Every other line is stripped of surrounding whitespace
 (which includes the CR of Windows line endings) and kept as a
 substring to look for in file paths.

 Args:
  config_path: Location of the denylist file.

 Returns:
  The patterns in file order, or [] when config_path is not an
  existing regular file.
 """
 # is_file() rather than exists(): a directory at this path would make
 # read_text() raise instead of being treated as "no config".
 if not config_path.is_file():
  return []
 # utf-8-sig drops a leading byte-order mark (common in files saved by
 # Windows editors). With plain utf-8 the BOM stays glued to the first
 # line, so a first-line pattern would silently never match any path.
 text = config_path.read_text(encoding="utf-8-sig")
 candidates = (line.strip() for line in text.splitlines())
 return [line for line in candidates if line and not line.startswith("#")]


def collect_leaks(repo_root: Path, patterns: list[str]) -> list[dict]:
 """Return working-tree files whose relative path matches a forbidden pattern.

 A path matches when any pattern occurs in it as a substring. Each entry
 is {"path": str (POSIX-style, relative to repo_root), "status": ...}:

 - "modified": in HEAD, with staged and/or unstaged changes
   (leak drift in progress).
 - "untracked": not in HEAD -- either newly staged via git add but not
   committed yet, or a new file git does not track at all.

 Tracked-but-clean files are NOT reported. The main repo's
 opencode.json, mcp_paths.toml, and other tracked forbidden patterns
 are legitimate; they are not leaks. Only files that have been
 MODIFIED locally (or are NEW) indicate sandbox drift.

 Paths inside a SKIP_DIRS directory (at any depth) or inside a
 top-level SKIP_TOP_DIRS directory are ignored for both statuses, as
 is the pre-commit hook's .tier2_leaked_* temp file.

 If git cannot be run, or repo_root has no .git of its own, every git
 query yields nothing and all matching files are reported as
 "untracked".

 Args:
  repo_root: Directory to audit; git is run with this as its cwd.
  patterns: Forbidden substrings; an empty list short-circuits to [].

 Returns:
  Entries for git-reported changes (sorted by path) followed by
  untracked files in sorted walk order. No path appears twice.
 """
 if not patterns:
  return []
 # Local import: `os` is not among this module's top-level imports.
 import os

 # Force git to operate ONLY on repo_root. By default git searches
 # upward for a parent .git/ directory; if repo_root is a subdirectory
 # of another repo (e.g. a tmp_path fixture inside the project tree),
 # git would report the PARENT's files as if they belonged to
 # repo_root. With GIT_DIR pinned to repo_root/.git, a missing .git
 # makes the git commands fail, which is treated as "no results".
 git_env = {**os.environ, "GIT_DIR": str(repo_root.resolve() / ".git")}

 def git_paths(*args: str) -> set[str]:
  """Run a NUL-delimited git listing; return its paths (empty on failure)."""
  try:
   proc = subprocess.run(
    ["git", *args],
    cwd=str(repo_root),
    capture_output=True,
    check=True,
    env=git_env,
   )
  except (subprocess.CalledProcessError, OSError):
   # OSError covers a missing git executable; handle it the same way
   # as a failing git command instead of crashing the audit.
   return set()
  # os.fsdecode matches how paths from the directory walk are decoded
  # and never raises on names that are not valid UTF-8.
  return {os.fsdecode(raw) for raw in proc.stdout.split(b"\0") if raw}

 def is_exempt(rel: str) -> bool:
  """Tell whether a relative POSIX path is outside the audit's scope."""
  if rel.startswith(".tier2_leaked_"):
   return True # the pre-commit hook's temp file
  dirs = rel.split("/")[:-1] # directory components only, not the file name
  if dirs and dirs[0] in SKIP_TOP_DIRS:
   return True
  return any(part in SKIP_DIRS for part in dirs)

 def is_forbidden(rel: str) -> bool:
  """Tell whether any forbidden pattern occurs in the path."""
  return any(pat in rel for pat in patterns)

 name_only = ("--name-only", "-z", "--no-renames")
 # Working tree vs index: local edits that have not been staged.
 unstaged = git_paths("diff", *name_only)
 # Index vs HEAD: edits already staged with git add. Plain `git diff`
 # does not show these, and `git ls-files` lists them as tracked, so
 # without this query a staged leak would go unreported.
 staged = git_paths("diff", "--cached", *name_only)
 # Staged paths that do not exist in HEAD (brand-new files).
 staged_new = git_paths("diff", "--cached", *name_only, "--diff-filter=A")
 # Everything in the index; used to skip tracked-but-clean files below.
 tracked = git_paths("ls-files", "-z")

 leaks: list[dict] = []
 for rel in sorted(unstaged | staged):
  if is_exempt(rel) or not is_forbidden(rel):
   continue
  status = "untracked" if rel in staged_new else "modified"
  leaks.append({"path": rel, "status": status})
 reported = {leak["path"] for leak in leaks}

 # Walk the working tree to catch untracked leaks. We do this manually
 # (rather than git ls-files --others --exclude-standard) to keep the
 # SKIP_DIRS rules visible in this script. Skipped directories are
 # pruned so the walk never descends into .git/, node_modules/, etc.
 for dirpath, dirnames, filenames in os.walk(repo_root):
  rel_dir = Path(dirpath).relative_to(repo_root)
  at_root = not rel_dir.parts
  dirnames[:] = sorted(
   name
   for name in dirnames
   if name not in SKIP_DIRS and not (at_root and name in SKIP_TOP_DIRS)
  )
  for name in sorted(filenames):
   if not (Path(dirpath) / name).is_file():
    continue # e.g. a dangling symlink
   rel = (rel_dir / name).as_posix()
   if is_exempt(rel):
    continue
   if rel in tracked or rel in reported:
    continue # tracked-and-clean, or already reported above
   if is_forbidden(rel):
    leaks.append({"path": rel, "status": "untracked"})
    reported.add(rel)
 return leaks


def render_human(leaks: list[dict]) -> str:
 """Build the plain-text leak report shown on the terminal.

 Args:
  leaks: Entries carrying "path" and "status" keys.

 Returns:
  A single "[OK]" line when leaks is empty. Otherwise a "[LEAK]"
  header with the count, one row per leak (status padded to nine
  columns, then the path) and a fixed explanatory footer. The text
  always ends with a newline.
 """
 if not leaks:
  return "[OK] No tier-2 sandbox-only files detected in the working tree.\n"
 header = [f"[LEAK] Found {len(leaks)} tier-2 sandbox-only file(s):", ""]
 rows = [" {:9s} {}".format(entry["status"], entry["path"]) for entry in leaks]
 footer = [
  "",
  "These files belong in the main repo only; they are modified by",
  "scripts/tier2/setup_tier2_clone.ps1 in the tier-2 clone.",
  "If committed, they would absorb the sandbox's local config drift.",
  "To remove from the working tree: git rm --cached <path>",
 ]
 return "\n".join(header + rows + footer) + "\n"


def render_json(leaks: list[dict]) -> str:
 """Serialize the leak report as indented JSON for machine consumption.

 Args:
  leaks: Entries carrying "path" and "status" keys.

 Returns:
  A JSON object with "files" (the entries as given) and "summary"
  (the total count plus per-status counts for "untracked" and
  "modified"), indented by two spaces.
 """
 statuses = [entry["status"] for entry in leaks]
 summary = {
  "total": len(leaks),
  "untracked": statuses.count("untracked"),
  "modified": statuses.count("modified"),
 }
 report = {"files": leaks, "summary": summary}
 return json.dumps(report, indent=2)


def main(argv: list[str] | None = None) -> int:
 parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
 parser.add_argument(
  "--strict",
  action="store_true",
  help="Exit 1 if any leak is detected. Default: exit 0 (informational).",
 )
 parser.add_argument(
  "--json",
  action="store_true",
  help="Emit machine-readable JSON instead of the human-readable report.",
 )
 args = parser.parse_args(argv)

 repo_root = Path.cwd()
 config_path = repo_root / CONFIG_REL
 patterns = load_patterns(config_path)
 if not patterns:
  print(
   f"warning: no forbidden patterns loaded from {config_path}; audit is a no-op.",
   file=sys.stderr,
  )
  leaks: list[dict] = []
 else:
  leaks = collect_leaks(repo_root, patterns)

 if args.json:
  print(render_json(leaks))
 else:
  print(render_human(leaks), end="")

 return 1 if (args.strict and leaks) else 0


# Script entry point: hand main()'s return value to the interpreter as
# the process exit status.
if __name__ == "__main__":
 raise SystemExit(main())