#!/bin/sh
# Tier 2 autonomous mode: prevent sandbox-only file leaks.
#
# setup_tier2_clone.ps1 modifies opencode.json and mcp_paths.toml in the
# clone (C:\projects\manual_slop_tier2\), and copies the tier-2 agent
# prompt + slash command from conductor/tier2/ into .opencode/. If a
# tier-2 commit captures any of these via `git add .`, the main repo
# would absorb the sandbox's local config drift.
#
# This hook runs on `git commit` in the tier-2 clone. It reads the
# denylist from conductor/tier2/githooks/forbidden-files.txt and
# auto-unstages any staged file whose path contains a forbidden
# substring. The commit then proceeds with only the legitimate work.
#
# Layer 1 (OpenCode permission system) blocks the tier-2 agent from
# editing these files directly. This hook is the backup layer at the
# commit boundary. Layer 3 is the audit script
# scripts/audit_tier2_leaks.py in the main repo.
#
# Why auto-unstage instead of exit 1: tier-2 cannot run `git restore
# --staged` (banned by the sandbox permission rules), so a hard reject
# would leave the agent stuck mid-flow. Auto-unstage + warn is the
# recoverable behavior.
#
# Why exit 0 always: the hook must never block the agent. Its job is to
# remove the leak, not to gate the commit. The failcount machinery in
# scripts/tier2/failcount.py tracks repeated red-phase failures and
# gives up the run; adding a hook-induced exit 1 would pollute that
# signal.

# Denylist location, relative to the directory the hook is run from.
CONFIG="conductor/tier2/githooks/forbidden-files.txt"

# A literal carriage return, used to strip CRLF line endings from patterns.
CR=$(printf '\r')

#######################################
# Decide whether a path matches the denylist.
# Each non-blank line of $CONFIG that does not start with '#' is a plain
# substring; the path matches when it contains any one of them.
# Globals:   CONFIG (read), CR (read)
# Arguments: $1 - path to check
# Returns:   0 if the path contains a forbidden substring, 1 otherwise
#######################################
is_forbidden() {
  # `|| [ -n "$pattern" ]` keeps a final line that has no trailing newline.
  while IFS= read -r pattern || [ -n "$pattern" ]; do
    # The config may have CRLF line endings; a stray trailing \r would
    # stop `*pattern*` from matching a clean path. Parameter expansion is
    # used instead of `tr` so no process is spawned per pattern.
    while [ "${pattern%"$CR"}" != "$pattern" ]; do
      pattern=${pattern%"$CR"}
    done
    case "$pattern" in
      '' | '#'*) continue ;;
    esac
    case "$1" in
      *"$pattern"*) return 0 ;;
    esac
  done < "$CONFIG"
  return 1
}

#######################################
# Undo whatever is staged for one path, leaving the working tree alone.
#   - Path exists in HEAD: copy HEAD's mode and object id back into the
#     index. `git rm --cached` must NOT be used here: it removes the index
#     entry, which stages a deletion of the tracked file, and that
#     deletion would then be committed.
#   - Path not in HEAD (newly added, or no commits yet): drop the index
#     entry so the file becomes untracked again.
# Uses neither `git restore` nor `git reset`.
# Arguments: $1 - path, relative to the current directory
# Returns:   status of the last git command run
#######################################
unstage_path() {
  if git rev-parse --verify --quiet "HEAD:$1" >/dev/null 2>&1; then
    # ls-tree output is the input format --index-info expects; -z on both
    # sides keeps unusual file names intact.
    git ls-tree -z HEAD -- "$1" 2>/dev/null \
      | git update-index -z --index-info 2>/dev/null
  else
    # --force: git otherwise refuses when the staged content differs from
    # the working-tree file.
    git rm --cached --quiet --force -- "$1" 2>/dev/null
  fi
}

#######################################
# Hook entry point: find staged paths that match the denylist, unstage
# them, and report what was removed on stderr.
# Globals:   CONFIG (read)
# Outputs:   warnings on stderr
# Returns:   0 always; the hook must not gate the commit
#######################################
main() {
  # No denylist, nothing to enforce.
  [ -f "$CONFIG" ] || return 0

  # The scan runs in a pipeline (i.e. a subshell), so matches are handed
  # back through a file. Prefer mktemp so the file lives outside the work
  # tree; fall back to the work-tree path if mktemp is unavailable.
  leaked=$(mktemp 2>/dev/null) || leaked="./.tier2_leaked_$$"
  trap 'rm -f -- "$leaked" 2>/dev/null' EXIT

  # -z gives NUL-terminated, unquoted paths. --no-renames lists both sides
  # of a staged rename, so the old name is checked as well.
  # NOTE(review): `read -d ''` is a bash extension, not POSIX sh; this
  # relies on the interpreter being bash-compatible - confirm for the
  # environment the hook is installed in.
  git diff --cached --name-only --no-renames -z \
    | while IFS= read -r -d '' path; do
        [ -n "$path" ] || continue
        if is_forbidden "$path"; then
          printf '%s\0' "$path"
        fi
      done > "$leaked"

  if [ -s "$leaked" ]; then
    printf '%s\n' "Tier 2: removing sandbox-only files from staging" >&2
    printf '%s\n' \
      "(these files belong in the main repo, not in tier-2 commits):" >&2
    while IFS= read -r -d '' path; do
      printf ' - %s\n' "$path" >&2
      # Best effort: report a failure instead of hiding it, but never
      # abort the commit over it.
      unstage_path "$path" < /dev/null \
        || printf '   warning: could not unstage %s\n' "$path" >&2
    done < "$leaked"

    printf '%s\n' "" >&2
    printf '%s\n' \
      "Commit will proceed without these files. To inspect what was" >&2
    printf '%s\n' "removed, run: git status" >&2
  fi

  rm -f -- "$leaked" 2>/dev/null
  return 0
}

# `|| true` pins the script's exit status to 0: the hook's job is to remove
# the leak, never to reject the commit.
main "$@" || true