pikuma_ps1/code/duffle/gte.h

#ifdef INTELLISENSE_DIRECTIVES
#	pragma once
#	include "dsl.h"
#	include "math.h"
#	include "mips.h"
#endif

/* ============================================================================
 *  gte.h — Geometry Transformation Engine (COP2) for the PS1
 * ============================================================================
 *
 *  Hand-rolled DSL for emitting GTE/MIPS instruction words as raw `.word`
 *  constants from C. No GCC inline-assembly string syntax in the code body.
 *
 *  PHILOSOPHY
 *  ----------
 *  1. A 32-bit instruction word is composed from per-field encoders. Each
 *     encoder knows only its own bit range; the composite ORs them together.
 *     No magic numbers inside any encoder body — every shift and mask is a
 *     named constant from the bitfield-layout enum below.
 *
 *  2. Pure (compile-time) instructions — every GTE *command* (RTPS, RTPT,
 *     NCLIP, MVMVA, …) and every COP2 *transfer* (ctc2/cfc2) with a constant
 *     rs/rt/rd — are emitted as a single integer constant via
 *     `asm_inline(...)` from gcc_asm.h. The C compiler constant-folds
 *     these into `.word` directives in .rodata.
 *
 *  3. Runtime-base-register instructions (lwc2, swc2, lw, sw, …) cannot be
 *     a pure compile-time word because the `rs` field is chosen by the
 *     compiler at codegen. For these we use a "placeholder-pun" pattern:
 *     a fixed register number (e.g. R_T4 = $12) is baked into the rs field
 *     of the `.word` constant, and the macro passes the pointer as an input
 *     operand. The input operand alone does NOT pin the register: the
 *     caller must bind the pointer to that exact GPR with a register
 *     variable (`register T* p __asm__("$12")`, see USAGE below). Only then
 *     does the baked-in constant match the register actually in use.
 *
 *  USAGE
 *  -----
 *      // Pure command sequence — all bits compile-time. Pass the fully
 *      // encoded `gte_cmdw_*` words, not the raw 6-bit `gte_cmd_*` ids:
 *      asm volatile(
 *          asm_words( nop, nop, gte_cmdw_rtpt, nop, nop, gte_cmdw_nclip )
 *          asm_clobber: clb_system
 *      );
 *
 *      // Runtime-base-register load — caller picks the base GPR:
 *      register V3_S2* p_in_12 __asm__("$12") = verts[0].ptr;
 *      gte_load_v0(p_in_12, R_T4);   // R_T4 = 12 = $t4 = $12
 *
 *      // Three independent bases for an RTPT pipeline:
 *      register V3_S2* p0 __asm__("$12") = verts[0].ptr;
 *      register V3_S2* p1 __asm__("$13") = verts[1].ptr;
 *      register V3_S2* p2 __asm__("$14") = verts[2].ptr;
 *      gte_load_v0(p0, R_T4);
 *      gte_load_v1(p1, R_T5);
 *      gte_load_v2(p2, R_T6);
 *      gte_rtpt();
 *
 *  STYLE NOTES
 *  -----------
 *  - Per-field encoders are named `enc_gte_<field>(value)` and each one
 *    self-masks its argument before shifting. Mirrors the `enc_op / enc_rs
 *    / enc_rt / ...` family in mips.h.
 *  - The composite `enc_gte_cmdw(sf, mx, v, cv, lm, cmd)` is a flat OR of
 *    the per-field encoders, plus the COP2/CO base.
 *  - Pre-baked shortcuts (`gte_cmd_rtpt`, `gte_cmd_rtps`, …) are defined
 *    for the common cases so call sites read like assembly source.
 *  - All register/field values are enums (not `#define`s) so they show up
 *    in debugger symbol tables and IDE autocomplete.
 *
 *  SEE ALSO
 *  --------
 *  - gcc_asm.h: the `.word` emitter (`asm_inline`, `asm_clobber`, clobbers)
 *  - mips.h:    the MIPS encoder layer this builds on
 */

/* --- GTE data registers (COP2 data register file, indices 0..31) ---
 *
 * Every register number is published twice:
 *   - `C2_<NAME>_Code` is a bare-integer `#define`, so the preprocessor can
 *     stringify the number (the `reg_str`/`rgcc` paths need a literal);
 *   - `C2_<NAME>` is an enumerator bound to the matching `_Code`.
 * Keep both lists in lock-step when a register is added or renamed. */

/* 0..5: input vectors V0..V2, XY word followed by Z word */
#define C2_VXY0_Code  0
#define C2_VZ0_Code   1
#define C2_VXY1_Code  2
#define C2_VZ1_Code   3
#define C2_VXY2_Code  4
#define C2_VZ2_Code   5

/* 6..7: RGB, OTZ */
#define C2_RGB_Code   6
#define C2_OTZ_Code   7

/* 8..11: IR0..IR3 */
#define C2_IR0_Code   8
#define C2_IR1_Code   9
#define C2_IR2_Code  10
#define C2_IR3_Code  11

/* 12..15: SXY0..SXY2, SXYP */
#define C2_SXY0_Code 12
#define C2_SXY1_Code 13
#define C2_SXY2_Code 14
#define C2_SXYP_Code 15

/* 16..19: SZ0..SZ3 */
#define C2_SZ0_Code  16
#define C2_SZ1_Code  17
#define C2_SZ2_Code  18
#define C2_SZ3_Code  19

/* 20..23: RGB0..RGB2, RES1 */
#define C2_RGB0_Code 20
#define C2_RGB1_Code 21
#define C2_RGB2_Code 22
#define C2_RES1_Code 23

/* 24..27: MAC0..MAC3 */
#define C2_MAC0_Code 24
#define C2_MAC1_Code 25
#define C2_MAC2_Code 26
#define C2_MAC3_Code 27

/* 28..31: IRGB, ORGB, LZCS, LZCR */
#define C2_IRGB_Code 28
#define C2_ORGB_Code 29
#define C2_LZCS_Code 30
#define C2_LZCR_Code 31

/* Enumerator view of the same indices (one per line, in register order). */
enum {
	C2_VXY0 = C2_VXY0_Code,
	C2_VZ0  = C2_VZ0_Code,
	C2_VXY1 = C2_VXY1_Code,
	C2_VZ1  = C2_VZ1_Code,
	C2_VXY2 = C2_VXY2_Code,
	C2_VZ2  = C2_VZ2_Code,
	C2_RGB  = C2_RGB_Code,
	C2_OTZ  = C2_OTZ_Code,
	C2_IR0  = C2_IR0_Code,
	C2_IR1  = C2_IR1_Code,
	C2_IR2  = C2_IR2_Code,
	C2_IR3  = C2_IR3_Code,
	C2_SXY0 = C2_SXY0_Code,
	C2_SXY1 = C2_SXY1_Code,
	C2_SXY2 = C2_SXY2_Code,
	C2_SXYP = C2_SXYP_Code,
	C2_SZ0  = C2_SZ0_Code,
	C2_SZ1  = C2_SZ1_Code,
	C2_SZ2  = C2_SZ2_Code,
	C2_SZ3  = C2_SZ3_Code,
	C2_RGB0 = C2_RGB0_Code,
	C2_RGB1 = C2_RGB1_Code,
	C2_RGB2 = C2_RGB2_Code,
	C2_RES1 = C2_RES1_Code,
	C2_MAC0 = C2_MAC0_Code,
	C2_MAC1 = C2_MAC1_Code,
	C2_MAC2 = C2_MAC2_Code,
	C2_MAC3 = C2_MAC3_Code,
	C2_IRGB = C2_IRGB_Code,
	C2_ORGB = C2_ORGB_Code,
	C2_LZCS = C2_LZCS_Code,
	C2_LZCR = C2_LZCR_Code
};

/* Role-based names for the GTE data registers that call sites touch most.
 * Each alias has exactly the value of the C2_* register it stands for, so
 * the two spellings are interchangeable wherever a register index is taken. */
enum {
	/* Inputs: the three vertex vectors (XY word, then Z word) and the colour. */
	gte_in_v0_xy    = C2_VXY0,
	gte_in_v0_z     = C2_VZ0,
	gte_in_v1_xy    = C2_VXY1,
	gte_in_v1_z     = C2_VZ1,
	gte_in_v2_xy    = C2_VXY2,
	gte_in_v2_z     = C2_VZ2,
	gte_in_rgb      = C2_RGB,

	/* Outputs: projected screen coordinates and the ordering-table depth. */
	gte_out_scr_xy0 = C2_SXY0,
	gte_out_scr_xy1 = C2_SXY1,
	gte_out_scr_xy2 = C2_SXY2,
	gte_out_depth   = C2_OTZ,

	/* Math accumulators MAC0..MAC2. */
	gte_math_accum0 = C2_MAC0,
	gte_math_accum1 = C2_MAC1,
	gte_math_accum2 = C2_MAC2,
};

/* --- GTE Command Semantics (The Bitfield Meanings) ---
 * A GTE command is a single 32-bit COP2 instruction word. The enumerators
 * below name (a) the values each option field can take, (b) the command
 * ids, and (c) the position/width/mask of every field that the
 * `enc_gte_<field>` encoders shift and mask with.
 */

enum {
/* Shift Fraction (bit 19).
 *
 * Hardware meaning (psx-spx, "GTE Command Encoding"):
 *   sf = 0 : results are NOT shifted (no fraction)
 *   sf = 1 : results are shifted right by 12 (4.12 fixed point)
 *
 * `gte_sf_noshift` / `gte_sf_shift12` name the bit by what the hardware
 * does; prefer them in new code. The two older names keep their ORIGINAL
 * values so every already-encoded word is unchanged, but their wording is
 * the reverse of the hardware behaviour: `gte_sf_integer` (1) selects the
 * 12-bit-fraction mode (it is the bit `gte_cmdw_psyq_compat` sets for
 * RTPS/RTPT), `gte_sf_fractional` (0) selects the unshifted mode. */

	gte_sf_noshift    = 0,  /* bit clear: no >>12 */
	gte_sf_shift12    = 1,  /* bit set:   >>12 applied */
	gte_sf_fractional = 0,  /* legacy spelling of gte_sf_noshift (name is misleading) */
	gte_sf_integer    = 1,  /* legacy spelling of gte_sf_shift12 (name is misleading) */

/* Matrix Select (Bits 18-17) - Which 3x3 matrix to multiply by */

	gte_mx_rotation   = 0,  /* Rotation Matrix (RT) */
	gte_mx_light      = 1,  /* Light Matrix (LL) */
	gte_mx_color      = 2,  /* Color Matrix (LC) */
	gte_mx_none       = 3,  /* Reserved encoding - not a usable matrix */

/* Vector select (Bits 16-15) - Which input vector to use */

	gte_v_v0        = 0,  /* Use Vector 0 (VXY0, VZ0) */
	gte_v_v1        = 1,  /* Use Vector 1 (VXY1, VZ1) */
	gte_v_v2        = 2,  /* Use Vector 2 (VXY2, VZ2) */
	gte_v_ir_regs   = 3,  /* Use Intermediate Registers (IR1, IR2, IR3) */

/* Control Vector Select (Bits 14-13) - Which vector to ADD after multiplication */

	gte_cv_translate  = 0,  /* Add Translation Vector (TRX, TRY, TRZ) */
	gte_cv_bg_color   = 1,  /* Add Background Color (RBK, GBK, BBK) */
	gte_cv_far_color  = 2,  /* Add Far Color (RFC, GFC, BFC) */
	gte_cv_none       = 3,  /* Add Zero (No addition) */

/* Limit mode (Bit 10) - saturation range applied to IR1..IR3 (psx-spx):
 *   lm = 0 : clamp to -0x8000..+0x7FFF
 *   lm = 1 : clamp to       0..+0x7FFF
 * Both settings saturate; the bit only selects the lower bound. */

	gte_lm_normal     = 0,  /* signed range */
	gte_lm_clamp      = 1,  /* non-negative range */

/* Core Command IDs (Bits 5-0) */

	gte_cmd_rtps      = 0x01, /* Rot/Trans Perspective Single (1 vertex) */
	gte_cmd_rtpt      = 0x30, /* Rot/Trans Perspective Triple (3 vertices) */
	gte_cmd_nclip     = 0x06, /* Normal Clipping (Backface culling) */
	gte_cmd_op        = 0x0C, /* Outer Product */
	gte_cmd_mvmva     = 0x12, /* Matrix Vector Multiply & Add (Custom math) */

/* --- GTE Command Bit-Field Layout ---
 * `gte_cmd_base` supplies the COP2 opcode and sets bit 25; the remaining
 * bits are laid out as:
 *
 *    31....26  25  24..20  19  18..17  16..15  14..13  12..11  10  9..6  5..0
 *   +---------+---+-------+---+-------+-------+-------+-------+---+-----+-----+
 *   | op_cop2 | 1 |  --   | sf|  mx   |   v   |  cv   |  --   | lm| --  | cmd |
 *   +---------+---+-------+---+-------+-------+-------+-------+---+-----+-----+
 *
 * For each field: `gte_shift_*` is the position of its lowest bit,
 * `gte_width_*` its size in bits, and `gte_mask_*` the matching unshifted
 * mask, derived from the width so the two can never disagree.
 */

	gte_shift_sf  = 19,  gte_width_sf  = 1,  gte_mask_sf  = (1 << gte_width_sf ) - 1,
	gte_shift_mx  = 17,  gte_width_mx  = 2,  gte_mask_mx  = (1 << gte_width_mx ) - 1,
	gte_shift_v   = 15,  gte_width_v   = 2,  gte_mask_v   = (1 << gte_width_v  ) - 1,
	gte_shift_cv  = 13,  gte_width_cv  = 2,  gte_mask_cv  = (1 << gte_width_cv ) - 1,
	gte_shift_lm  = 10,  gte_width_lm  = 1,  gte_mask_lm  = (1 << gte_width_lm ) - 1,
	gte_shift_cmd =  0,  gte_width_cmd = 6,  gte_mask_cmd = (1 << gte_width_cmd) - 1,
};

/* --- GTE Control Register Indices (for ctc2/cfc2) ---
 *
 * Index of the COP2 *control* register that holds each named value, as
 * documented in psx-spx ("GTE Registers", cnt0..cnt31). The three 3x3
 * matrices store 16-bit elements packed TWO PER 32-bit REGISTER (first name
 * in the low half, second in the high half), so neighbouring element names
 * share an index:
 *
 *    0 RT11|RT12    1 RT13|RT21    2 RT22|RT23    3 RT31|RT32    4 RT33
 *    5 TRX          6 TRY          7 TRZ
 *    8 L11|L12      9 L13|L21     10 L22|L23     11 L31|L32     12 L33
 *   13 RBK         14 GBK         15 BBK
 *   16 LR1|LR2     17 LR3|LG1     18 LG2|LG3     19 LB1|LB2     20 LB3
 *   21 RFC         22 GFC         23 BFC
 *   24 OFX         25 OFY         26 H           27 DQA         28 DQB
 *   29 ZSF3        30 ZSF4        31 FLAG
 *
 * There is no gap in the register file. A `ctc2` to a packed register
 * always writes both halves, so updating e.g. RT12 alone means rewriting the
 * whole RT11|RT12 word.
 *
 * Each enum value is bound to a parallel `_Code` `#define` so the
 * preprocessor can stringify the integer (for `reg_str`/`rgcc` paths).
 * Same pattern as the GPR `_Code` set in mips.h. */

/* Rotation matrix (RT): registers 0..4 */
#define gte_cr_RT11_Code  0
#define gte_cr_RT12_Code  0
#define gte_cr_RT13_Code  1
#define gte_cr_RT21_Code  1
#define gte_cr_RT22_Code  2
#define gte_cr_RT23_Code  2
#define gte_cr_RT31_Code  3
#define gte_cr_RT32_Code  3
#define gte_cr_RT33_Code  4
/* Translation vector: registers 5..7 */
#define gte_cr_TRX_Code   5
#define gte_cr_TRY_Code   6
#define gte_cr_TRZ_Code   7
/* Light matrix (L): registers 8..12 */
#define gte_cr_L11_Code   8
#define gte_cr_L12_Code   8
#define gte_cr_L13_Code   9
#define gte_cr_L21_Code   9
#define gte_cr_L22_Code  10
#define gte_cr_L23_Code  10
#define gte_cr_L31_Code  11
#define gte_cr_L32_Code  11
#define gte_cr_L33_Code  12
/* Background colour: registers 13..15 */
#define gte_cr_RBK_Code  13
#define gte_cr_GBK_Code  14
#define gte_cr_BBK_Code  15
/* Light colour matrix (LR/LG/LB): registers 16..20 */
#define gte_cr_LR1_Code  16
#define gte_cr_LR2_Code  16
#define gte_cr_LR3_Code  17
#define gte_cr_LG1_Code  17
#define gte_cr_LG2_Code  18
#define gte_cr_LG3_Code  18
#define gte_cr_LB1_Code  19
#define gte_cr_LB2_Code  19
#define gte_cr_LB3_Code  20
/* Far colour: registers 21..23 */
#define gte_cr_RFC_Code  21
#define gte_cr_GFC_Code  22
#define gte_cr_BFC_Code  23
/* Screen offset, projection and depth-cue parameters, flags: 24..31 */
#define gte_cr_OFX_Code  24
#define gte_cr_OFY_Code  25
#define gte_cr_H_Code    26
#define gte_cr_DQA_Code  27
#define gte_cr_DQB_Code  28
#define gte_cr_ZSF3_Code 29
#define gte_cr_ZSF4_Code 30
#define gte_cr_FLAG_Code 31

enum {
	gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code,
	gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code,
	gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code,
	gte_cr_TRX  = gte_cr_TRX_Code,  gte_cr_TRY  = gte_cr_TRY_Code,  gte_cr_TRZ  = gte_cr_TRZ_Code,
	gte_cr_L11  = gte_cr_L11_Code,  gte_cr_L12  = gte_cr_L12_Code,  gte_cr_L13  = gte_cr_L13_Code,
	gte_cr_L21  = gte_cr_L21_Code,  gte_cr_L22  = gte_cr_L22_Code,  gte_cr_L23  = gte_cr_L23_Code,
	gte_cr_L31  = gte_cr_L31_Code,  gte_cr_L32  = gte_cr_L32_Code,  gte_cr_L33  = gte_cr_L33_Code,
	gte_cr_RBK  = gte_cr_RBK_Code,  gte_cr_GBK  = gte_cr_GBK_Code,  gte_cr_BBK  = gte_cr_BBK_Code,
	gte_cr_LR1  = gte_cr_LR1_Code,  gte_cr_LR2  = gte_cr_LR2_Code,  gte_cr_LR3  = gte_cr_LR3_Code,
	gte_cr_LG1  = gte_cr_LG1_Code,  gte_cr_LG2  = gte_cr_LG2_Code,  gte_cr_LG3  = gte_cr_LG3_Code,
	gte_cr_LB1  = gte_cr_LB1_Code,  gte_cr_LB2  = gte_cr_LB2_Code,  gte_cr_LB3  = gte_cr_LB3_Code,
	gte_cr_RFC  = gte_cr_RFC_Code,  gte_cr_GFC  = gte_cr_GFC_Code,  gte_cr_BFC  = gte_cr_BFC_Code,
	gte_cr_OFX  = gte_cr_OFX_Code,  gte_cr_OFY  = gte_cr_OFY_Code,
	gte_cr_H    = gte_cr_H_Code,    gte_cr_DQA  = gte_cr_DQA_Code,  gte_cr_DQB  = gte_cr_DQB_Code,
	gte_cr_ZSF3 = gte_cr_ZSF3_Code, gte_cr_ZSF4 = gte_cr_ZSF4_Code, gte_cr_FLAG = gte_cr_FLAG_Code,
};

/* Primary opcodes of the two COP2 memory-transfer instructions; these are
 * the `op` argument handed to `enc_i` by enc_gte_lw / enc_gte_sw.
 * `_C2_OPS_` is a zero-valued leading entry that only anchors the list. */
enum {
	_C2_OPS_ = 0,
	op_lwc2  = 0x32, /* lwc2: load a word from memory into a GTE data register */
	op_swc2  = 0x3A, /* swc2: store a GTE data register to memory */
};

/* COP2 (GTE) register-transfer format: mfc2/cfc2/mtc2/ctc2 rt, rd
 * Layout: [op_cop2:6][sub:5][rt:5][rd:5][0:11]
 *   - sub: transfer sub-opcode, placed in the `rs` field (see enum below)
 *   - rt:  GPR source/dest
 *   - rd:  COP2 register index (0..31); whether it addresses the data or
 *          the control register file is decided by `sub`
 *
 * Sub-opcodes, named so no call site carries a bare number:
 *   gte_tx_mfc2 (0x00)  GPR <- GTE data    register
 *   gte_tx_cfc2 (0x02)  GPR <- GTE control register
 *   gte_tx_mtc2 (0x04)  GPR -> GTE data    register
 *   gte_tx_ctc2 (0x06)  GPR -> GTE control register */
enum {
	gte_tx_mfc2 = 0x00,
	gte_tx_cfc2 = 0x02,
	gte_tx_mtc2 = 0x04,
	gte_tx_ctc2 = 0x06,
};

#define enc_gte_tx(sub, rt, rd) (enc_op(op_cop2) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd))

/* Explicit GTE Data vs Control Register Transfers.
 * Matrices, translation and the other setup state live in the CONTROL file
 * (use gte_ct / gte_cf); vertex inputs and results live in the DATA file
 * (use gte_mt / gte_mf). */
#define gte_mf(rt, rd) enc_gte_tx(gte_tx_mfc2, (rt), (rd)) /* Move from GTE Data Reg (e.g. MAC0, OTZ) */
#define gte_cf(rt, rd) enc_gte_tx(gte_tx_cfc2, (rt), (rd)) /* Move from GTE Control Reg */
#define gte_mt(rt, rd) enc_gte_tx(gte_tx_mtc2, (rt), (rd)) /* Move to GTE Data Reg (e.g. VXY0) */
#define gte_ct(rt, rd) enc_gte_tx(gte_tx_ctc2, (rt), (rd)) /* Move to GTE Control Reg (e.g. Matrices) */

/* COP2 data load/store words: `lwc2 rt, off(base)` / `swc2 rt, off(base)`.
 *
 * Both are I-type words built by `enc_i(op, rs, rt, imm)` from mips.h:
 *   - rt   : GTE data register index (0..31), e.g. a C2_* / gte_in_* value
 *   - base : number of the GPR that holds the address (goes in `rs`)
 *   - off  : offset added to the base (the instruction's 16-bit immediate)
 *
 * Note the argument order: these macros take the GTE register first, then
 * the base, and swap them into enc_i's (rs, rt) order.
 *
 * The result is a compile-time integer, so `base` must be a register NUMBER
 * known at compile time. When the address lives in a compiler-chosen
 * variable, use the `gte_load_vN` macros further down, which pair these
 * words with a caller-pinned register variable. */
#define enc_gte_lw(rt, base, off) enc_i(op_lwc2, (base), (rt), (off))
#define enc_gte_sw(rt, base, off) enc_i(op_swc2, (base), (rt), (off))

/* Short spellings: inside the `gte_` namespace the `c2` suffix is implied.
 *   gte_lw rt, base, off  →  lwc2 rt, off(base)
 *   gte_sw rt, base, off  →  swc2 rt, off(base)
 * They expand to exactly the same word as enc_gte_lw / enc_gte_sw. */
#define gte_lw(rt, base, off) enc_i(op_lwc2, (base), (rt), (off))
#define gte_sw(rt, base, off) enc_i(op_swc2, (base), (rt), (off))

/* GTE command words (the instructions that start a GTE computation).
 *
 * Every command shares `gte_cmd_base`: the op_cop2 primary opcode with
 * bit 25 set. The option fields and the command id are OR-ed on top of it.
 *
 * Two ways to build a word:
 *   - piecewise, with the `enc_gte_<field>(x)` encoders. Each one masks its
 *     argument to the field width and shifts it into place, so a caller can
 *     vary a single field (handy for state-driven MVMVA emitters);
 *   - all at once, with `enc_gte_cmdw(sf, mx, v, cv, lm, cmd)`, which is
 *     nothing more than the base OR-ed with all six encoders. */
#define gte_cmd_base (enc_op(op_cop2) | (1 << 25))

/* Shared body of the per-field encoders: `name` selects the
 * gte_mask_<name> / gte_shift_<name> pair from the layout enum. */
#define enc_gte_field_(val, name) (((val) & gte_mask_##name) << gte_shift_##name)

#define enc_gte_sf(x)    enc_gte_field_(x, sf)
#define enc_gte_mx(x)    enc_gte_field_(x, mx)
#define enc_gte_v(x)     enc_gte_field_(x, v)
#define enc_gte_cv(x)    enc_gte_field_(x, cv)
#define enc_gte_lm(x)    enc_gte_field_(x, lm)
#define enc_gte_cmd(x)   enc_gte_field_(x, cmd)

/* Full command word: base plus all six fields. */
#define enc_gte_cmdw(sf, mx, v, cv, lm, cmd) \
	( gte_cmd_base | enc_gte_sf(sf) | enc_gte_mx(mx) | enc_gte_v(v) \
	  | enc_gte_cv(cv) | enc_gte_lm(lm) | enc_gte_cmd(cmd) )

/* Pre-baked GTE command words for the common cases.
 *
 * These are pure compile-time integer constants, ready to be dropped into
 * an `asm_words(...)` list (see `gte_rtpt` below for the canonical idiom).
 *
 * Naming follows the file's convention: `gte_cmd_*` is the raw 6-bit `cmd`
 * field id, `gte_cmdw_*` is the fully-encoded 32-bit instruction word.
 *
 * --------------------------------------------------------------------------
 *  Why RTPS/RTPT carry extra bits
 *
 *  The Sony PsyQ `inline_n.h` ships RTPS as `cop2 0x0180001` and RTPT as
 *  `cop2 0x0280030`. Besides the command id, those words set:
 *
 *   - bit 19, the `sf` field. sf = 1 makes the GTE shift its results right
 *     by 12, which is what 4.12 fixed-point rotation matrices require.
 *     THIS is the load-bearing bit: with sf = 0 the products are not scaled
 *     back, the projected coordinates come out wrong, and NCLIP then culls
 *     the triangle.
 *   - bits 24..20, the "fake command number" (psx-spx, "GTE Command
 *     Encoding"). The GTE ignores this field; PsyQ fills it with a
 *     per-command number (0x01 for RTPS, 0x02 for RTPT). We reproduce it
 *     only so the emitted words are bit-identical to the PsyQ ones.
 *
 *  `gte_cmdw_psyq_compat` keeps its historical value (fake number 0x02 plus
 *  sf = 1, i.e. 0x0280000) for any code that already ORs it in. RTPS gets
 *  its own fake number so that it matches `0x0180001` exactly.
 *
 *  NCLIP/OP/MVMVA are emitted with every option field zero (sf = 0, fake
 *  number 0). That is functionally fine for NCLIP. For OP/MVMVA it means
 *  "no >>12, rotation matrix, V0, translation vector, signed clamp"; build
 *  other variants with `enc_gte_cmdw(...)`.
 * --------------------------------------------------------------------------
 */

/* Bits 24..20: fake command number (ignored by the hardware). */
#define gte_cmdw_fake(n)      (((n) & 0x1Fu) << 20)

/* sf is given as the literal bit value 1 (shift right by 12). */
#define gte_cmdw_psyq_compat  (gte_cmdw_fake(0x02) | enc_gte_sf(1))

#define gte_cmdw_rtps   (gte_cmd_base | enc_gte_cmd(gte_cmd_rtps ) | gte_cmdw_fake(0x01) | enc_gte_sf(1))
#define gte_cmdw_rtpt   (gte_cmd_base | enc_gte_cmd(gte_cmd_rtpt ) | gte_cmdw_psyq_compat)
#define gte_cmdw_nclip  (gte_cmd_base | enc_gte_cmd(gte_cmd_nclip))
#define gte_cmdw_op     (gte_cmd_base | enc_gte_cmd(gte_cmd_op   ))
#define gte_cmdw_mvmva  (gte_cmd_base | enc_gte_cmd(gte_cmd_mvmva))
/* Pre-baked `lwc2` words for loading the GTE input vectors V0..V2.
 *
 * A vector occupies two GTE data registers: the packed XY word and the Z
 * word. In memory the XY word sits at the vector's base address and the Z
 * word `GTE_Z_Offset` bytes after it.
 *
 *   gte_lw_v0(base)   → lwc2 gte_in_v0_xy, 0(base)
 *   gte_lw_v0z(base)  → lwc2 gte_in_v0_z,  GTE_Z_Offset(base)
 *   gte_lw_v1(base)   → lwc2 gte_in_v1_xy, 0(base)
 *   gte_lw_v1z(base)  → lwc2 gte_in_v1_z,  GTE_Z_Offset(base)
 *   gte_lw_v2(base)   → lwc2 gte_in_v2_xy, 0(base)
 *   gte_lw_v2z(base)  → lwc2 gte_in_v2_z,  GTE_Z_Offset(base)
 *
 * `base` is the NUMBER of the GPR to bake into the word's `rs` field, not a
 * pointer. The words are plain integers built by `enc_gte_lw`; the
 * `gte_load_vN` macros below are what tie them to a pointer held in that
 * register. */

enum {
	GTE_Z_Offset = 4 /* byte offset of the Z word inside a vector */
};

/* Shared shapes: XY word at offset 0, Z word at GTE_Z_Offset. */
#define gte_lw_xy_(reg, base)  enc_gte_lw(reg, base, 0)
#define gte_lw_z_(reg, base)   enc_gte_lw(reg, base, GTE_Z_Offset)

#define gte_lw_v0(base)   gte_lw_xy_(gte_in_v0_xy, (base))
#define gte_lw_v0z(base)  gte_lw_z_( gte_in_v0_z,  (base))
#define gte_lw_v1(base)   gte_lw_xy_(gte_in_v1_xy, (base))
#define gte_lw_v1z(base)  gte_lw_z_( gte_in_v1_z,  (base))
#define gte_lw_v2(base)   gte_lw_xy_(gte_in_v2_xy, (base))
#define gte_lw_v2z(base)  gte_lw_z_( gte_in_v2_z,  (base))

/* gte_load_vN(r_ptr, base) — placeholder-punned lwc2 loaders
 *
 * Each macro is one `asm volatile(...)` statement that emits two `.word`
 * constants: `gte_lw_vN(base)` (XY word, offset 0) followed by
 * `gte_lw_vNz(base)` (Z word, offset GTE_Z_Offset).
 *
 * Parameters:
 *   r_ptr : pointer to the vector in memory; passed to the asm as an input
 *           operand through `r_use(r_ptr)`.
 *   base  : NUMBER of the GPR that the two words use as their base register
 *           (typically one of R_T4..R_T9 for the standard "3-pointer"
 *           pattern).
 *
 * The two parameters are not linked by the macro. The caller MUST bind
 * `r_ptr` to GPR `base` via a register variable:
 *
 *     register V3_S2* p_in_12 __asm__("$12") = my_ptr;
 *     gte_load_v0(p_in_12, R_T4);   // R_T4 = 12, base is $12
 *
 * The input operand then lives in $12 (the only register `p_in_12` can
 * occupy), which is exactly the register the .word constants expect. If the
 * binding is missing or names a different register, the words read through
 * whatever happens to be in `base`.
 *
 * The base register is deliberately NOT in the clobber list: a `"$12"`
 * clobber would conflict with the register-variable binding ("asm
 * specifier for variable conflicts with asm clobber list").
 *
 * The clobber list names R_V0, R_T0, R_T1, R_RA and `clb_mem_drain`.
 * NOTE(review): the two lwc2 words themselves write only GTE registers; the
 * GPR clobbers look like a conservative project-wide policy — confirm
 * against gcc_asm.h before trimming them.
 *
 * WHICH REGISTER TO PICK
 * ----------------------
 * Recommended default for an RTPT-style 3-pointer pipeline:
 *   gte_load_v0(p0, R_T4);  // $12
 *   gte_load_v1(p1, R_T5);  // $13
 *   gte_load_v2(p2, R_T6);  // $14
 * Avoid $0 (zero), $1 (at), $26/$27 (k0/k1), $28-$31 (gp/sp/fp/ra), and any
 * register that appears in the clobber list.
 *
 * Shape of each statement, in source order:
 *   asm_words( w0, w1 )            the two instruction words
 *   asm_rpins, r_use(r_ptr)        the pinned-register input operand
 *   asm_clobber: ...               the clobber list
 * `asm_words`, `asm_rpins`, `r_use`, `asm_clobber`, `rlit` and
 * `clb_mem_drain` come from gcc_asm.h; their exact expansions are not
 * visible in this file. */
#define gte_load_v0(r_ptr, base) asm volatile( \
	asm_words( gte_lw_v0(base), gte_lw_v0z(base) ) \
	asm_rpins,   r_use(r_ptr)                                    \
	asm_clobber: rlit(R_V0_Code), rlit(R_T0_Code), rlit(R_T1_Code), rlit(R_RA_Code), clb_mem_drain \
)

/* Same as gte_load_v0, for input vector V1. */
#define gte_load_v1(r_ptr, base) asm volatile( \
	asm_words( gte_lw_v1(base), gte_lw_v1z(base) ) \
	asm_rpins,   r_use(r_ptr)                                    \
	asm_clobber: rlit(R_V0_Code), rlit(R_T0_Code), rlit(R_T1_Code), rlit(R_RA_Code), clb_mem_drain \
)

/* Same as gte_load_v0, for input vector V2. */
#define gte_load_v2(r_ptr, base) asm volatile( \
	asm_words( gte_lw_v2(base), gte_lw_v2z(base) ) \
	asm_rpins,   r_use(r_ptr)                                    \
	asm_clobber: rlit(R_V0_Code), rlit(R_T0_Code), rlit(R_T1_Code), rlit(R_RA_Code), clb_mem_drain \
)

/* gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) — the canonical prelude to gte_rtpt().
 *
 * One `asm volatile(...)` statement that emits six `.word` constants, in
 * this order: V0 xy, V0 z (base b0), V1 xy, V1 z (base b1), V2 xy, V2 z
 * (base b2). The three pointers are passed as input operands.
 *
 * Parameters:
 *   p0, p1, p2 : pointers to the three vectors in memory.
 *   b0, b1, b2 : NUMBERS of the GPRs baked into the words as base registers.
 *
 * As with gte_load_vN, the macro does not tie `pN` to `bN`; the caller must
 * bind each pointer to its register via a register variable:
 *
 *   register V3_S2* p0 rgcc(R_T4) = verts[0].ptr;  // → __asm__("$12")
 *   register V3_S2* p1 rgcc(R_T5) = verts[1].ptr;  // → __asm__("$13")
 *   register V3_S2* p2 rgcc(R_T6) = verts[2].ptr;  // → __asm__("$14")
 *   gte_load_v0v1v2(p0, p1, p2, R_T4, R_T5, R_T6);
 *   gte_rtpt();
 *
 * The clobber list is the same as gte_load_vN's (R_V0, R_T0, R_T1, R_RA,
 * clb_mem_drain); none of b0..b2 may be one of those registers.
 */
#define gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) asm volatile( \
	asm_words( \
		gte_lw_v0(b0), gte_lw_v0z(b0),  \
		gte_lw_v1(b1), gte_lw_v1z(b1),  \
		gte_lw_v2(b2), gte_lw_v2z(b2) ) \
	asm_rpins                             \
		, r_use(p0), r_use(p1), r_use(p2)   \
	asm_clobber: rlit(R_V0_Code), rlit(R_T0_Code), rlit(R_T1_Code), rlit(R_RA_Code), clb_mem_drain \
)

/**
 * @brief Rotate, Translate and Perspective Triple (23 cycles)
 *
 * @details Performs rotation, translation and perspective calculation of three
 * vertices at once. The equation performed is the same as gte_rtps() only
 * repeated three times for each vertex. The result of the first vertex is
 * stored in GTE data register C2_SXY0, the second vector in C2_SXY1 then
 * C2_SXY2.
 *
 * Encoder-style emission (no inline-asm strings in the code body). The
 * statement emits three words, in order:
 *   1. Two `nop` words, so that any preceding lwc2/swc2 has retired before
 *      RTPT starts reading its inputs from V0/V1/V2.
 *   2. `gte_cmdw_rtpt`: `gte_cmd_base` OR-ed with cmd = gte_cmd_rtpt and
 *      with `gte_cmdw_psyq_compat`, which sets bit 21 and the `sf` bit
 *      (bit 19). The low 25 bits are therefore `0x0280030`, the same
 *      payload the string form `gte_rtpt_ori()` passes to `cop2`. The MX,
 *      V, CV and LM fields are zero; SF is NOT zero.
 *
 * The only clobber entry is `clb_system` (defined in gcc_asm.h, not shown
 * here; NOTE(review): presumably the caller-saved GPR set plus "memory" —
 * confirm). No COP2 data/control register is declared as clobbered — those
 * have to be saved by the caller if they need to survive across the call
 * (RTPT writes SXY0..2, SZ0..3, OTZ, MAC0..3, IR0..3, etc.).
 */
#define gte_rtpt()                        \
	asm volatile(                           \
		asm_words( nop, nop, gte_cmdw_rtpt )  \
		asm_clobber: clb_system               \
	)

/* String-asm form of RTPT: two `nop`s followed by `cop2 0x0280030`.
 * The immediate is the same 25-bit payload that `gte_cmdw_rtpt` encodes
 * (bit 21, the sf bit, and cmd 0x30). Takes no operands and declares no
 * clobbers. NOTE(review): presumably kept as the original ("ori")
 * reference to compare against the encoder-style gte_rtpt() — confirm. */
#define gte_rtpt_ori() __asm__ volatile("nop;" "nop;" "cop2 0x0280030;")

/**
 * @brief Normal clipping (8 cycles)
 *
 * @details Computes the sign of three screen coordinates (C2_SXY0-2) used for
 * backface culling. If the value of C2_MAC0 is negative, the coordinates are
 * inverted and thus the triangle is back facing.
 *
 * The following equation is performed when executing this GTE command:
 *
 *     MAC0 = SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
 *
 * Encoder-style emission (no inline-asm strings in the code body). The
 * statement emits three words, in order:
 *   1. Two `nop` words, so that any preceding lwc2/swc2/RTPT has retired
 *      before NCLIP starts reading its inputs from SXY0/SXY1/SXY2.
 *   2. `gte_cmdw_nclip`: `gte_cmd_base` OR-ed with cmd = gte_cmd_nclip and
 *      nothing else, so the low 25 bits are just `0x06`. Unlike
 *      `gte_cmdw_rtps`/`gte_cmdw_rtpt` it does not OR in
 *      `gte_cmdw_psyq_compat`; all of SF/MX/V/CV/LM are zero.
 *
 * The only clobber entry is `clb_system` (defined in gcc_asm.h, not shown
 * here; NOTE(review): presumably the caller-saved GPR set plus "memory" —
 * confirm). No COP2 data/control register is declared as clobbered - those
 * have to be saved by the caller if they need to survive across the call
 * (NCLIP writes MAC0 only; it is purely a sign-of-double-product
 * computation on SXY0..2).
 */
#define gte_nclip()                       \
	asm volatile(                           \
		asm_words( nop, nop, gte_cmdw_nclip ) \
		asm_clobber: clb_system               \
	)

/* Stores GTE data register 7 (C2_OTZ, the ordering-table depth) to the
 * word at address `r0`, using a single `swc2`.
 *   r0 : destination address; any GPR the compiler picks ("r" constraint).
 * Declares a "memory" clobber because it writes through the pointer. */
#define gte_stotz(r0)                \
	__asm__ volatile(                \
		"swc2   $7, 0( %0 )"         \
		: /* no outputs */           \
		: "r"(r0)                    \
		: "memory")

/* Stores the three projected screen coordinates to three separate words:
 *   GTE data register 12 (C2_SXY0) → *r0
 *   GTE data register 13 (C2_SXY1) → *r1
 *   GTE data register 14 (C2_SXY2) → *r2
 * Each address is an independent "r" input, so the destinations need not be
 * contiguous. Declares a "memory" clobber because it writes through the
 * pointers. */
#define gte_stsxy3(r0, r1, r2) \
	__asm__ volatile("swc2   $12, 0( %0 );" "swc2   $13, 0( %1 );" "swc2   $14, 0( %2 )" \
		: /* no outputs */ \
		: "r"(r0), "r"(r1), "r"(r2) \
		: "memory")

/* String-asm GTE command: two `nop`s followed by `cop2 0x0158002D`
 * (command id 0x2D in the low six bits). Takes no operands and declares no
 * clobbers. NOTE(review): by its name this is AVSZ3, presumably the
 * three-point Z average whose result is read back with gte_stotz() —
 * confirm against the GTE reference. */
#define gte_avsz3() __asm__ volatile("nop;" "nop;" "cop2 0x0158002D;")

/* asm_gte_matrix_set_rotation(r0)
 *
 * Loads the 3x3 rotation matrix at `r0` into the GTE's rotation-matrix
 * CONTROL registers 0..4 via ctc2.
 *
 * Memory layout at r0: five contiguous 32-bit words (offsets 0..16), each
 * holding two packed 16-bit matrix elements. The GTE stores the rotation
 * matrix the same way (psx-spx: control registers 0..4 = RT11|RT12,
 * RT13|RT21, RT22|RT23, RT31|RT32, RT33), so these five words are the
 * WHOLE matrix; the high half of the fifth word is padding.
 * NOTE(review): assumes M3_S2 is nine row-major 16-bit elements followed by
 * 16 bits of padding — confirm against its definition.
 *
 * Generated MIPS ($12 = R_T4 is the base, $13/$14 = R_T5/R_T6 are scratch):
 *
 *   lw   $13,  0( $12 )   ; word 0
 *   lw   $14,  4( $12 )   ; word 1
 *   ctc2 $13,  $0
 *   ctc2 $14,  $1
 *   lw   $13,  8( $12 )   ; word 2
 *   lw   $14, 12( $12 )   ; word 3
 *   ctc2 $13,  $2
 *   lw   $13, 16( $12 )   ; word 4
 *   ctc2 $14,  $3
 *   ctc2 $13,  $4
 *
 * No register is consumed by the instruction directly after the `lw` that
 * loads it, so the sequence is safe with respect to the load delay slot.
 *
 * The transfers use `gte_ct` (ctc2). `gte_mt` (mtc2) would address the
 * DATA register file and overwrite the input vectors VXY0..VXY2 instead of
 * the matrix.
 *
 * Same contract as gte_load_v0: caller MUST bind `r0` to $12 via a
 * register variable (`rgcc(R_T4)`). The `r_use(r0)` input operand alone
 * doesn't force a specific GPR, while the .word constants hard-code R_T4 as
 * the base of every lw.
 *
 *   M3_S2* m = ...;
 *   register M3_S2* m_in_12 rgcc(R_T4) = m;
 *   asm_gte_matrix_set_rotation(m_in_12);
 *
 * Clobbers: the two scratch registers $13/$14 plus `clb_system`. The base
 * $12 is left unmodified and is NOT clobbered: it is an input operand, and
 * listing an input's register as a clobber is rejected by GCC ("asm
 * specifier for variable conflicts with asm clobber list").
 */
#define asm_gte_matrix_set_rotation(r0) \
	asm volatile(                         \
		asm_words(                          \
			  load_word(R_T5, R_T4,  0)       \
			, load_word(R_T6, R_T4,  4)       \
			, gte_ct(   R_T5,        0)       \
			, gte_ct(   R_T6,        1)       \
			, load_word(R_T5, R_T4,  8)       \
			, load_word(R_T6, R_T4, 12)       \
			, gte_ct(   R_T5,        2)       \
			, load_word(R_T5, R_T4, 16)       \
			, gte_ct(   R_T6,        3)       \
			, gte_ct(   R_T5,        4)       \
		)                                   \
		asm_rpins, r_use(r0)                \
		asm_clobber: clb_system, rlit(R_T5_Code), rlit(R_T6_Code) \
	)