Private
Public Access
0
0

feat(audit): add audit_tier2_leaks.py for tier-2 sandbox file leak detection

Adds scripts/audit_tier2_leaks.py as defense-in-depth layer 3 (the
pre-commit hook is layer 2; OpenCode permission rules are layer 1).
The audit scans the main repo's working tree for files matching the
forbidden patterns in conductor/tier2/githooks/forbidden-files.txt.

Behavior:
- Default mode (exit 0): informational report of any leaks found.
  Useful for manual inspection and pre-commit workflow.
- --strict mode (exit 1 if leaks): CI gate. The hook at the commit
  boundary is the live guard; this is the safety net for any leak
  that somehow slips through (manual edits, ops mistakes).
- --json mode: machine-readable output for CI integration.

Detection rules:
- "untracked" status: file exists in working tree but is not in
  HEAD and not in `git ls-files`. Indicates a leak as a new file.
- "modified" status: file is in HEAD but the working tree differs.
  Indicates a leak in progress (tier-2 setup modified a file).
- Files that are tracked and unmodified are NOT reported: the main
  repo legitimately tracks opencode.json, mcp_paths.toml, etc. —
  the patterns are about CONTENT (modifications by tier-2), not
  file existence.

Skip rules:
- .git/, node_modules/, __pycache__/, .venv/, venv/ (ignored dirs)
- tests/ (test infrastructure, not user code)
- conductor/ (canonical source for tier-2 files; if they're here
  in a leak, they were committed, not just sitting in working tree)
- .tier2_leaked_* (the pre-commit hook's temp file)

Missing config file: warn to stderr, exit 0 with empty report. The
hook also no-ops in this case; both layers degrade safely.

Tests (tests/test_audit_tier2_leaks.py, 13 cases):
- Clean tree returns 0
- Each forbidden file type detected (agent, command, opencode.json,
  mcp_paths.toml)
- Non-forbidden files ignored (including legitimate
  conductor/tier2/agents/tier2-tech-lead.md which contains 'tier2-'
  in path)
- Strict mode exits 1 on leak, 0 when clean
- Default mode reports leaks but exits 0
- Missing config handled gracefully
- --json output shape stable
- Summary counts correct

All 13 pass.
This commit is contained in:
2026-06-20 01:47:23 -04:00
parent 81e1fd7b2c
commit f5d8ea047a
2 changed files with 396 additions and 0 deletions
+210
View File
@@ -0,0 +1,210 @@
"""Audit for tier-2 sandbox-only files leaking into the main repo.
Defense-in-depth layer 3 (after the pre-commit hook at the commit
boundary): scans the working tree for files matching the forbidden
patterns in conductor/tier2/githooks/forbidden-files.txt. If any
match, the file is reported as a leak.
Usage:
uv run python scripts/audit_tier2_leaks.py # informational
uv run python scripts/audit_tier2_leaks.py --strict # CI gate (exit 1)
uv run python scripts/audit_tier2_leaks.py --json # machine-readable
Behavior:
- Walks the working tree, skipping .git/, node_modules/, __pycache__/,
.venv/ and venv/ at any depth, plus the top-level tests/ and
conductor/ directories (project-controlled, not sandbox-controlled)
- For each candidate file, checks if its relative path contains
any forbidden pattern as a substring
- Reports each leak with its path and status (untracked/modified)
- Default mode exits 0; --strict mode exits 1 if any leaks
This script is the manual/CI guard. The pre-commit hook at
conductor/tier2/githooks/pre-commit is the live guard; both layers
must be present for the defense-in-depth contract to hold.
"""
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
# Location of the denylist of forbidden path substrings, relative to the
# repository root (main() joins it onto the repo root before loading).
CONFIG_REL = Path("conductor/tier2/githooks/forbidden-files.txt")
# Names that exclude a path from the tree walk when they appear as ANY
# component of its relative path.
SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv"}
# Test infrastructure and the canonical source directory for tier-2
# files. tests/ and conductor/ (which contains conductor/tier2/) are
# project-controlled, not tier-2-sandbox-controlled, so the tree walk
# skips any path whose FIRST component is one of these names.
SKIP_TOP_DIRS = {"tests", "conductor"}
def load_patterns(config_path: Path) -> list[str]:
    """Load substring patterns from the denylist config.

    Blank lines and lines whose first non-blank character is '#' are
    skipped. Surrounding whitespace is stripped from every line, which
    also removes a stray CR left by Windows line endings. Each remaining
    line is a substring to look for in file paths.

    Args:
        config_path: Location of the denylist file.

    Returns:
        The patterns in file order, or an empty list when the file does
        not exist.
    """
    try:
        # "utf-8-sig" drops a leading byte-order mark if one is present.
        # With plain "utf-8" the BOM would be glued onto the first pattern
        # (or hide a leading '#'), so that pattern could never match a path.
        text = config_path.read_text(encoding="utf-8-sig")
    except (FileNotFoundError, NotADirectoryError):
        # Missing config: the caller treats "no patterns" as a no-op audit.
        return []

    candidates = (raw.strip() for raw in text.splitlines())
    return [line for line in candidates if line and not line.startswith("#")]
def collect_leaks(repo_root: Path, patterns: list[str]) -> list[dict]:
"""Walk the working tree and return files matching any forbidden pattern.
Each entry: {"path": str (relative), "status": "untracked"|"modified"}.
"modified" = in HEAD but modified in working tree (leak drift in progress).
"untracked" = not in HEAD (a leak staged via git add but not committed yet,
OR a leak as a new untracked file).
Tracked-but-clean files are NOT reported. The main repo's
opencode.json, mcp_paths.toml, and other tracked forbidden patterns
are legitimate; they are not leaks. Only files that have been
MODIFIED locally (or are NEW) indicate sandbox drift.
"""
if not patterns:
return []
# Get the set of modified-status from git. This avoids walking
# node_modules and other ignored directories ourselves.
try:
modified_proc = subprocess.run(
["git", "diff", "--name-only", "-z", "--no-renames"],
cwd=str(repo_root),
capture_output=True,
check=True,
)
modified = {
p.decode("utf-8") if isinstance(p, bytes) else p
for p in modified_proc.stdout.split(b"\0")
if p
}
except subprocess.CalledProcessError:
modified = set()
# Get tracked files for the untracked check (a path is untracked iff
# not in `git ls-files`).
try:
tracked_proc = subprocess.run(
["git", "ls-files", "-z"],
cwd=str(repo_root),
capture_output=True,
check=True,
)
tracked = {
p.decode("utf-8") if isinstance(p, bytes) else p
for p in tracked_proc.stdout.split(b"\0")
if p
}
except subprocess.CalledProcessError:
tracked = set()
leaks: list[dict] = []
# Scan modified files (tracked but changed in working tree)
for rel_path in sorted(modified):
if any(pat in rel_path for pat in patterns):
leaks.append({"path": rel_path, "status": "modified"})
# Walk the working tree to catch untracked leaks. We do this manually
# (rather than git ls-files --others --exclude-standard) to keep the
# SKIP_DIRS rules visible in this script.
for path in repo_root.rglob("*"):
if not path.is_file():
continue
rel = path.relative_to(repo_root).as_posix()
# Skip top-level project directories (tests, conductor) plus the
# standard ignored dirs.
parts = path.relative_to(repo_root).parts
if parts[0] in SKIP_TOP_DIRS:
continue
if any(part in SKIP_DIRS for part in parts):
continue
# Skip the pre-commit hook's temp file
if rel.startswith(".tier2_leaked_"):
continue
if rel in tracked:
continue # already handled above
if any(pat in rel for pat in patterns):
leaks.append({"path": rel, "status": "untracked"})
# De-duplicate (in case a path appears in multiple sources)
seen: set[str] = set()
unique: list[dict] = []
for leak in leaks:
if leak["path"] not in seen:
seen.add(leak["path"])
unique.append(leak)
return unique
def render_human(leaks: list[dict]) -> str:
    """Build the plain-text report printed to the terminal.

    Args:
        leaks: Entries carrying "path" and "status" keys.

    Returns:
        A single "[OK]" line when *leaks* is empty; otherwise a "[LEAK]"
        header, one line per entry (status padded to nine columns), and a
        fixed explanatory footer. The text always ends with a newline.
    """
    if not leaks:
        return "[OK] No tier-2 sandbox-only files detected in the working tree.\n"

    entries = [f" {item['status']:9s} {item['path']}" for item in leaks]
    report_lines = [
        f"[LEAK] Found {len(leaks)} tier-2 sandbox-only file(s):",
        "",
        *entries,
        "",
        "These files belong in the main repo only; they are modified by",
        "scripts/tier2/setup_tier2_clone.ps1 in the tier-2 clone.",
        "If committed, they would absorb the sandbox's local config drift.",
        "To remove from the working tree: git rm --cached <path>",
    ]
    # Terminate every line, including the last one.
    return "".join(f"{line}\n" for line in report_lines)
def render_json(leaks: list[dict]) -> str:
    """Serialize the leak report as indented JSON for machine consumption.

    Args:
        leaks: Entries carrying "path" and "status" keys.

    Returns:
        A JSON object with "files" (the entries as given) and "summary"
        (total count plus per-status counts for "untracked" and
        "modified"), indented by two spaces.
    """
    statuses = [entry["status"] for entry in leaks]
    summary = {
        "total": len(leaks),
        "untracked": statuses.count("untracked"),
        "modified": statuses.count("modified"),
    }
    payload = {"files": leaks, "summary": summary}
    return json.dumps(payload, indent=2)
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
parser.add_argument(
"--strict",
action="store_true",
help="Exit 1 if any leak is detected. Default: exit 0 (informational).",
)
parser.add_argument(
"--json",
action="store_true",
help="Emit machine-readable JSON instead of the human-readable report.",
)
args = parser.parse_args(argv)
repo_root = Path.cwd()
config_path = repo_root / CONFIG_REL
patterns = load_patterns(config_path)
if not patterns:
print(
f"warning: no forbidden patterns loaded from {config_path}; audit is a no-op.",
file=sys.stderr,
)
leaks: list[dict] = []
else:
leaks = collect_leaks(repo_root, patterns)
if args.json:
print(render_json(leaks))
else:
print(render_human(leaks), end="")
return 1 if (args.strict and leaks) else 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())