#ifdef INTELLISENSE_DIRECTIVES
#	pragma once
#	include "dsl.h"
#	include "math.h"
#	include "mips.h"
#endif

/* C2 data registers */

/* --- GTE Data Registers (Coprocessor 2) ---
 * Register numbers 0..31 of the COP2 data-register file, one per line.
 */
enum {
	/* Input vectors: packed X/Y word followed by the Z word. */
	C2_VXY0 = 0,
	C2_VZ0  = 1,
	C2_VXY1 = 2,
	C2_VZ1  = 3,
	C2_VXY2 = 4,
	C2_VZ2  = 5,

	C2_RGB  = 6,   /* input color (R, G, B, code) */
	C2_OTZ  = 7,   /* ordering-table Z (depth) output */

	/* Intermediate registers. */
	C2_IR0  = 8,
	C2_IR1  = 9,
	C2_IR2  = 10,
	C2_IR3  = 11,

	/* Screen X/Y outputs. */
	C2_SXY0 = 12,
	C2_SXY1 = 13,
	C2_SXY2 = 14,
	C2_SXYP = 15,

	/* Screen Z outputs. */
	C2_SZ0  = 16,
	C2_SZ1  = 17,
	C2_SZ2  = 18,
	C2_SZ3  = 19,

	/* Color outputs. */
	C2_RGB0 = 20,
	C2_RGB1 = 21,
	C2_RGB2 = 22,

	C2_RES1 = 23,  /* reserved slot */

	/* Math accumulators. */
	C2_MAC0 = 24,
	C2_MAC1 = 25,
	C2_MAC2 = 26,
	C2_MAC3 = 27,

	C2_IRGB = 28,
	C2_ORGB = 29,
	C2_LZCS = 30,
	C2_LZCR = 31
};

/* Semantic Aliases for GTE Data Registers
 * Descriptive names for a subset of the C2_* data-register numbers; each
 * alias has exactly the value of the C2_* constant it is assigned from.
 */
enum {
	/* Inputs: three vertex vectors, each split into an X/Y word and a Z word. */
	gte_in_v0_xy = C2_VXY0,
	gte_in_v0_z  = C2_VZ0,
	gte_in_v1_xy = C2_VXY1,
	gte_in_v1_z  = C2_VZ1,
	gte_in_v2_xy = C2_VXY2,
	gte_in_v2_z  = C2_VZ2,

	/* Input color (R, G, B, Code). */
	gte_in_rgb = C2_RGB,

	/* Outputs: ordering-table Z (depth) and the three screen X/Y coordinates. */
	gte_out_depth   = C2_OTZ,
	gte_out_scr_xy0 = C2_SXY0,
	gte_out_scr_xy1 = C2_SXY1,
	gte_out_scr_xy2 = C2_SXY2,

	/* Math accumulators 0..2. */
	gte_math_accum0 = C2_MAC0,
	gte_math_accum1 = C2_MAC1,
	gte_math_accum2 = C2_MAC2,
};

/* --- GTE Command Semantics (The Bitfield Meanings) ---
 * A GTE command is a single 32-bit word sent to COP2.
 * It is highly configurable via bitfields.
 *
 * All constants below are raw field values / bit positions as the hardware
 * decodes them; they are not pre-shifted.
 */

enum {
/* Shift Fraction (Bit 19) - Determines fixed-point division.
 * Hardware encoding: 0 = no shift, 1 = shift the result right by 12 bits. */

	gte_sf_integer    = 0,  /* No division (Raw integer math) */
	gte_sf_fractional = 1,  /* Divide result by 4096 (Standard 4.12 fixed point) */

/* Matrix Select (Bits 18-17) - Which 3x3 matrix to multiply by */

	gte_mx_rotation   = 0,  /* Rotation Matrix (RT) */
	gte_mx_light      = 1,  /* Light Matrix (LL) */
	gte_mx_color      = 2,  /* Color Matrix (LC) */
	gte_mx_none       = 3,  /* Reserved encoding; hardware docs describe the
	                         * matrix used here as garbage, not "no multiply" */

/* Vector select (Bits 16-15) - Which input vector to use */

	gte_v_v0        = 0,  /* Use Vector 0 (VXY0, VZ0) */
	gte_v_v1        = 1,  /* Use Vector 1 (VXY1, VZ1) */
	gte_v_v2        = 2,  /* Use Vector 2 (VXY2, VZ2) */
	gte_v_ir_regs   = 3,  /* Use Intermediate Registers (IR1, IR2, IR3) */

/* Control Vector Select (Bits 14-13) - Which vector to ADD after multiplication */

	gte_cv_translate  = 0,  /* Add Translation Vector (TRX, TRY, TRZ) */
	gte_cv_bg_color   = 1,  /* Add Background Color (RBK, GBK, BBK) */
	gte_cv_far_color  = 2,  /* Add Far Color (RFC, GFC, BFC); hardware docs
	                         * flag this MVMVA mode as buggy - verify before use */
	gte_cv_none       = 3,  /* Add Zero (No addition) */

/* Limit/Clamp (Bit 10) - Selects the saturation range of IR1..IR3 */

	gte_lm_normal     = 0,  /* Saturate to the signed range -0x8000..+0x7FFF */
	gte_lm_clamp      = 1,  /* Saturate to the non-negative range 0..+0x7FFF */

/* Core Command IDs (Bits 5-0) */

	gte_cmd_rtps      = 0x01, /* Rot/Trans Perspective Single (1 vertex) */
	gte_cmd_nclip     = 0x06, /* Normal Clipping (Backface culling) */
	gte_cmd_op        = 0x0C, /* Outer Product */
	gte_cmd_mvmva     = 0x12, /* Matrix Vector Multiply & Add (Custom math) */
	gte_cmd_rtpt      = 0x30, /* Rot/Trans Perspective Triple (3 vertices) */

/* --- GTE Command Bit-Field Layout ---
 * A GTE command word (COP2 opcode with the CO bit, bit 25, set) is laid out as:
 *
 *    31..26   25  24..20  19  18..17  16..15  14..13  12..11  10   9..6  5..0
 *   +--------+---+-------+----+-------+-------+-------+-------+----+------+-----+
 *   | op_cop2| 1 |  --   | sf |  mx   |   v   |  cv   |  --   | lm |  --  | cmd |
 *   +--------+---+-------+----+-------+-------+-------+-------+----+------+-----+
 *
 * Fields marked "--" are not encoded by the constants in this file.
 *
 * Shifts/masks below are the *bit positions* and *bit widths* of each
 * configurable field, used by the enc_gte_* encoder macros. Each mask is
 * the unshifted value mask, i.e. (1 << width) - 1. Mirrors the
 * OPCODE_SHIFT / RS_SHIFT convention used in mips.h.
 */

	gte_shift_sf  = 19,  gte_width_sf  = 1,  gte_mask_sf  = 0x1,
	gte_shift_mx  = 17,  gte_width_mx  = 2,  gte_mask_mx  = 0x3,
	gte_shift_v   = 15,  gte_width_v   = 2,  gte_mask_v   = 0x3,
	gte_shift_cv  = 13,  gte_width_cv  = 2,  gte_mask_cv  = 0x3,
	gte_shift_lm  = 10,  gte_width_lm  = 1,  gte_mask_lm  = 0x1,
	gte_shift_cmd =  0,  gte_width_cmd = 6,  gte_mask_cmd = 0x3F,
};

/* --- GTE Control Register Indices (for ctc2/cfc2) ---
 *
 * Each constant is the control-register number (0..31) that holds the named
 * quantity. The 3x3 matrices store their 16-bit elements two per 32-bit
 * register (first-named element in the low half, second in the high half),
 * so two element names can legitimately share one register index:
 *
 *   reg 0: RT11|RT12   reg 1: RT13|RT21   reg 2: RT22|RT23
 *   reg 3: RT31|RT32   reg 4: RT33
 *
 * The light matrix (base 8) and light-color matrix (base 16) are packed the
 * same way. Translation, background color, far color and screen offset each
 * occupy one full register per component.
 */

enum {
	/* Rotation matrix, registers 0..4 (packed pairs). */
	gte_cr_RT11 = 0,  gte_cr_RT12 = 0,
	gte_cr_RT13 = 1,  gte_cr_RT21 = 1,
	gte_cr_RT22 = 2,  gte_cr_RT23 = 2,
	gte_cr_RT31 = 3,  gte_cr_RT32 = 3,
	gte_cr_RT33 = 4,

	/* Translation vector, registers 5..7. */
	gte_cr_TRX  = 5,  gte_cr_TRY  = 6,  gte_cr_TRZ  = 7,

	/* Light source matrix, registers 8.. (packed pairs). */
	gte_cr_L11  = 8,  gte_cr_L12  = 8,
	gte_cr_L13  = 9,  gte_cr_L21  = 9,
	gte_cr_L22  = 10, gte_cr_L23  = 10,

	/* Background color, registers 13..15. */
	gte_cr_RBK  = 13, gte_cr_GBK  = 14, gte_cr_BBK  = 15,

	/* Light color matrix, registers 16.. (packed pairs; LR3 shares reg 17). */
	gte_cr_LR1  = 16, gte_cr_LR2  = 16,
	gte_cr_LR3  = 17,

	/* Far color, registers 21..23. */
	gte_cr_RFC  = 21, gte_cr_GFC  = 22, gte_cr_BFC  = 23,

	/* Screen offset, registers 24..25. */
	gte_cr_OFX  = 24, gte_cr_OFY  = 25,
};

/* COP2 (GTE) Transfer Format
 * Builds a COP2 register-transfer instruction word by OR-ing four fields:
 *   sub - value placed in the RS field; selects the transfer sub-operation
 *         (e.g. the move-to / move-from variants defined alongside enc_rs)
 *   rt  - CPU general-purpose register number (RT field)
 *   rd  - coprocessor register number (RD field)
 * The opcode field is always op_cop2.
 *
 * The expansion is fully parenthesized so it can be combined with other
 * operators (&, ==, *, ...) without the low-precedence `|` re-associating. */
#define enc_cop2_tx(sub, rt, rd) (enc_op(op_cop2) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd))

/* GTE Command Format (The math engine trigger)
 * Opcode is always op_cop2, and bit 25 (the CO bit) is always set.
 * The lower 25 bits are the GTE-specific command payload.
 *
 * The granular `enc_gte_<field>(x)` macros below mirror the `enc_op`/`enc_rs`
 * pattern in mips.h: each one self-masks and shifts its own field, so a
 * caller can build up a GTE command piece by piece (handy for state-driven
 * MVMVA emitters that vary one field at a time).
 *
 * `enc_gte_cmd(sf, mx, v, cv, lm, cmd)` is the all-in-one convenience for
 * emitting a full command word in one go. It just ORs the per-field
 * encoders together with `gte_cmd_base`. */
#define gte_cmd_base (enc_op(op_cop2) | (1 << 25))

/* Per-field encoders. Each one does (value & mask) << shift on its own.
 *
 * The command-ID field encoder is named `enc_gte_cmd_id`, not `enc_gte_cmd`:
 * a macro name can have only one definition, and a macro's own name is never
 * re-expanded inside its expansion, so sharing the name with the composite
 * below would leave an unexpanded `enc_gte_cmd(cmd)` call in the output. */
#define enc_gte_sf(sf)       (((sf)  & gte_mask_sf ) << gte_shift_sf )
#define enc_gte_mx(mx)       (((mx)  & gte_mask_mx ) << gte_shift_mx )
#define enc_gte_v(v)         (((v)   & gte_mask_v  ) << gte_shift_v  )
#define enc_gte_cv(cv)       (((cv)  & gte_mask_cv ) << gte_shift_cv )
#define enc_gte_lm(lm)       (((lm)  & gte_mask_lm ) << gte_shift_lm )
#define enc_gte_cmd_id(cmd)  (((cmd) & gte_mask_cmd) << gte_shift_cmd)

/* Composite: all six GTE fields + the COP2/CO base.
 *   sf  - shift-fraction flag        (bit 19)
 *   mx  - matrix select              (bits 18-17)
 *   v   - vector select              (bits 16-15)
 *   cv  - control/translation vector (bits 14-13)
 *   lm  - limit/saturation flag      (bit 10)
 *   cmd - command ID                 (bits 5-0)
 * Out-of-range arguments are truncated by the per-field masks. */
#define enc_gte_cmd(sf, mx, v, cv, lm, cmd) ( \
     gte_cmd_base        \
   | enc_gte_sf(sf)      \
   | enc_gte_mx(mx)      \
   | enc_gte_v(v)        \
   | enc_gte_cv(cv)      \
   | enc_gte_lm(lm)      \
   | enc_gte_cmd_id(cmd) \
)

/* asm_gte_matrix_set_rotation(r0)
 *
 * Emits one `asm volatile` statement that reads five consecutive 32-bit
 * words starting at the address in `r0` (byte offsets 0, 4, 8, 12, 16) and
 * transfers them, in order, to COP2 registers 0..4.
 *
 * Parameter:
 *   r0 - base address of the source words; passed to the asm statement as
 *        an "r" (general-purpose register) input operand.
 *
 * Scratch registers: R_T4, R_T5 and R_T6 carry the loaded words; "$12",
 * "$13" and "$14" are listed as clobbered together with clb_system.
 *
 * Emission order, exactly as written in the macro (loads are interleaved
 * with transfers):
 *
 *   load  R_T4 <-  0(r0)      ; word 0
 *   load  R_T5 <-  4(r0)      ; word 1
 *   xfer  R_T4 -> cop2 reg 0
 *   xfer  R_T5 -> cop2 reg 1
 *   load  R_T4 <-  8(r0)      ; word 2
 *   load  R_T5 <- 12(r0)      ; word 3
 *   load  R_T6 <- 16(r0)      ; word 4
 *   xfer  R_T4 -> cop2 reg 2
 *   xfer  R_T5 -> cop2 reg 3
 *   xfer  R_T6 -> cop2 reg 4
 *
 * Intended instructions: `lw` for each load and `ctc2` for each transfer.
 * load_imm and the asm_inline/asm_clobber wrappers are defined outside this
 * file, so the exact encodings are not visible here.
 *
 * Matrix coverage: the GTE packs the nine 16-bit rotation elements two per
 * control register (reg 0 = RT11|RT12, reg 1 = RT13|RT21, reg 2 = RT22|RT23,
 * reg 3 = RT31|RT32, reg 4 = RT33), so five words written to control
 * registers 0..4 cover the complete 3x3 matrix; no rotation element is left
 * stale. The source is presumably the nine contiguous 16-bit elements of an
 * SDK MATRIX struct followed by 16 bits of padding - TODO confirm against
 * the caller's type.
 *
 * NOTE(review): the transfers are encoded with sub-op `cop_mt`. If mips.h
 * defines that as the move-to-DATA-register form (mtc2) rather than the
 * control-register form (ctc2), this sequence writes data registers 0..4
 * instead of the rotation matrix. Confirm the sub-op value in mips.h.
 */
#define asm_gte_matrix_set_rotation(r0) \
	asm volatile(                         \
		asm_inline(                         \
			 load_imm(R_T4, r0,  0),          \
			 load_imm(R_T5, r0,  4),          \
			 enc_cop2_tx(cop_mt, R_T4,  0),   \
			 enc_cop2_tx(cop_mt, R_T5,  1),   \
			 load_imm(R_T4, r0,  8),          \
			 load_imm(R_T5, r0, 12),          \
			 load_imm(R_T6, r0, 16),          \
			 enc_cop2_tx(cop_mt, R_T4,  2),   \
			 enc_cop2_tx(cop_mt, R_T5,  3),   \
			 enc_cop2_tx(cop_mt, R_T6,  4)    \
		)                                   \
		asm_clobber( clb_system, "$12", "$13", "$14") \
		:                                             \
		: "r"(r0)                                     \
	)