gte_nclip docs

This commit is contained in:
2026-06-04 19:40:21 -04:00
parent 8d03366d92
commit 915b14ef31
3 changed files with 107 additions and 77 deletions
+89 -60
View File
@@ -246,16 +246,16 @@ enum {
#define gte_cr_OFY_Code 31
/* Enum constants mirroring the gte_cr_*_Code defines: each gte_cr_<NAME>
 * enumerator takes the value of the matching gte_cr_<NAME>_Code macro. */
enum {
gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code,
gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code,
gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code,
gte_cr_TRX = gte_cr_TRX_Code, gte_cr_TRY = gte_cr_TRY_Code, gte_cr_TRZ = gte_cr_TRZ_Code,
gte_cr_L11 = gte_cr_L11_Code, gte_cr_L12 = gte_cr_L12_Code, gte_cr_L13 = gte_cr_L13_Code,
gte_cr_L21 = gte_cr_L21_Code, gte_cr_L22 = gte_cr_L22_Code, gte_cr_L23 = gte_cr_L23_Code,
gte_cr_LR1 = gte_cr_LR1_Code, gte_cr_LR2 = gte_cr_LR2_Code, gte_cr_LR3 = gte_cr_LR3_Code,
gte_cr_RBK = gte_cr_RBK_Code, gte_cr_GBK = gte_cr_GBK_Code, gte_cr_BBK = gte_cr_BBK_Code,
gte_cr_RFC = gte_cr_RFC_Code, gte_cr_GFC = gte_cr_GFC_Code, gte_cr_BFC = gte_cr_BFC_Code,
gte_cr_OFX = gte_cr_OFX_Code, gte_cr_OFY = gte_cr_OFY_Code,
gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code,
gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code,
gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code,
gte_cr_TRX = gte_cr_TRX_Code, gte_cr_TRY = gte_cr_TRY_Code, gte_cr_TRZ = gte_cr_TRZ_Code,
gte_cr_L11 = gte_cr_L11_Code, gte_cr_L12 = gte_cr_L12_Code, gte_cr_L13 = gte_cr_L13_Code,
gte_cr_L21 = gte_cr_L21_Code, gte_cr_L22 = gte_cr_L22_Code, gte_cr_L23 = gte_cr_L23_Code,
gte_cr_LR1 = gte_cr_LR1_Code, gte_cr_LR2 = gte_cr_LR2_Code, gte_cr_LR3 = gte_cr_LR3_Code,
gte_cr_RBK = gte_cr_RBK_Code, gte_cr_GBK = gte_cr_GBK_Code, gte_cr_BBK = gte_cr_BBK_Code,
gte_cr_RFC = gte_cr_RFC_Code, gte_cr_GFC = gte_cr_GFC_Code, gte_cr_BFC = gte_cr_BFC_Code,
gte_cr_OFX = gte_cr_OFX_Code, gte_cr_OFY = gte_cr_OFY_Code,
};
enum { _C2_OPS_ = 0
@@ -440,25 +440,25 @@ enum { _C2_OPS_ = 0
* The `asm_clobber(...)` helper from gcc_asm.h prepends the colon that
* starts the clobbers section. */
/* gte_load_v0(r_ptr, base): emits the gte_lwc2_v0(base) and gte_lwc2_v0z(base)
 * encoder words in one asm volatile statement, with r_ptr bound as a
 * register ("r") input. The clobber list names the R_V0, R_T0, R_T1 and R_RA
 * register strings plus "memory". */
#define gte_load_v0(r_ptr, base) \
asm volatile( \
asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
asm volatile( \
asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
/* gte_load_v1(r_ptr, base): same shape as gte_load_v0, but emits the
 * gte_lwc2_v1(base) / gte_lwc2_v1z(base) encoder words. r_ptr is a register
 * input; the clobber list is identical (R_V0, R_T0, R_T1, R_RA, "memory"). */
#define gte_load_v1(r_ptr, base) \
asm volatile( \
asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
asm volatile( \
asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
/* gte_load_v2(r_ptr, base): same shape as gte_load_v0, but emits the
 * gte_lwc2_v2(base) / gte_lwc2_v2z(base) encoder words. r_ptr is a register
 * input; the clobber list is identical (R_V0, R_T0, R_T1, R_RA, "memory"). */
#define gte_load_v2(r_ptr, base) \
asm volatile( \
asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
asm volatile( \
asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) ) \
, "r"(r_ptr) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
/* gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) — the canonical prelude to gte_cmd_rtpt.
*
@@ -473,13 +473,13 @@ enum { _C2_OPS_ = 0
* gte_rtpt();
*/
/* All six load encoder words (v0/v0z with b0, v1/v1z with b1, v2/v2z with b2)
 * are emitted inside a single asm volatile statement; p0, p1 and p2 are bound
 * as register ("r") inputs in that order. */
#define gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) \
asm volatile( \
asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0), \
gte_lwc2_v1(b1), gte_lwc2_v1z(b1), \
gte_lwc2_v2(b2), gte_lwc2_v2z(b2) ) \
, "r"(p0), "r"(p1), "r"(p2) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
asm volatile( \
asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0), \
gte_lwc2_v1(b1), gte_lwc2_v1z(b1), \
gte_lwc2_v2(b2), gte_lwc2_v2z(b2) ) \
, "r"(p0), "r"(p1), "r"(p2) \
asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \
)
/**
* @brief Rotate, Translate and Perspective Triple (23 cycles)
@@ -507,40 +507,69 @@ enum { _C2_OPS_ = 0
* they need to survive across the call (RTPT writes SXY0..2, SZ0..3,
* OTZ, MAC0..3, IR0..3, etc.).
*/
/* Encoder form of RTPT: two nop words followed by the gte_cmdw_rtpt command
 * word, with clb_system as the clobber list. The two spellings below differ
 * only in how the nop encoder is invoked (`nop()` vs `nop`), which must match
 * whichever form of the nop macro is in effect. */
#define gte_rtpt() \
asm volatile( \
asm_inline( nop(), nop(), gte_cmdw_rtpt ) \
asm_clobber( clb_system ) \
)
#define gte_rtpt() \
asm volatile( \
asm_inline( nop, nop, gte_cmdw_rtpt ) \
asm_clobber( clb_system ) \
)
/* String-asm form of RTPT: two `nop` instructions followed by
 * `cop2 0x0280030`. Takes no operands and declares no clobbers.
 * NOTE(review): the `_ori` suffix presumably marks this as the original
 * (pre-encoder) spelling kept for comparison - confirm. */
#define gte_rtpt_ori() \
__asm__ volatile( \
"nop;" \
"nop;" \
"cop2 0x0280030;")
#define gte_rtpt_ori() \
__asm__ volatile( \
"nop;" \
"nop;" \
"cop2 0x0280030;")
/* String-asm form of NCLIP: two `nop` instructions followed by
 * `cop2 0x01400006`. Takes no operands and declares no clobbers.
 * NOTE(review): gte_nclip() is defined again further down in encoder form
 * (asm_inline with gte_cmdw_nclip). The two bodies differ, so only one
 * definition can be active - confirm this string form is the one being
 * retired. */
#define gte_nclip() \
__asm__ volatile( \
"nop;" \
"nop;" \
"cop2 0x01400006;")
/**
* @brief Normal clipping (8 cycles)
*
* @details Computes a signed value from the three screen coordinates
* (C2_SXY0-2) whose sign is used for backface culling. If the value of
* C2_MAC0 is negative, the vertex order is inverted and thus the triangle
* is back facing.
*
* The following equation is performed when executing this GTE command:
*
* MAC0 = SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
*
* Encoder-style emission (no inline-asm strings in the code body):
* 1. Two `nop` words fill the COP2 pipeline latency - the GTE
* pipeline takes a few cycles per op, and the nops let any
* preceding lwc2/swc2/RTPT retire before NCLIP starts reading
* its inputs from SXY0/SXY1/SXY2.
* 2. The NCLIP command word itself is `gte_cmdw_nclip` (see the
* pre-baked encoders above). Its string-asm equivalent is
* `cop2 0x01400006`: `op_cop2` | CO(1) | cmd=NCLIP (0x06), with
* all SF/MX/V/CV/LM fields zero. NCLIP is spec-clean in the
* original PsyQ source (unlike RTPS/RTPT which carry the
* `gte_cmdw_psyq_compat` quirk), so `gte_cmdw_nclip` does NOT OR
* in any reserved bits.
* NOTE(review): the immediate 0x01400006 also has 0x14 in bits
* 20-24 (0x01400000), which the decomposition above does not
* list - confirm `gte_cmdw_nclip` carries those bits if the
* emitted word must match the PsyQ word bit for bit.
*
* Clobbers the caller-saved GPRs via `clb_system` (per the kernel
* ABI) plus the standard "memory" barrier. Does not clobber any COP2
* data/control register - those have to be saved by the caller if
* they need to survive across the call (of the data registers NCLIP
* writes MAC0 only; the result is the signed sum of products shown
* in the equation above, computed from SXY0..2).
*/
#define gte_nclip() \
asm volatile( \
asm_inline( nop, nop, gte_cmdw_nclip ) \
asm_clobber( clb_system ) \
)
/* gte_stotz(r0): stores COP2 data register $7 to the word at address r0
 * (`swc2 $7, 0(r0)`), with a "memory" clobber.
 * NOTE(review): $7 is presumably the OTZ register, going by the macro name. */
#define gte_stotz(r0) __asm__ volatile("swc2 $7, 0( %0 )" : : "r"(r0) : "memory")
/* gte_stsxy3(r0, r1, r2): stores COP2 data registers $12, $13 and $14 to the
 * words at addresses r0, r1 and r2 respectively, with a "memory" clobber.
 * NOTE(review): $12..$14 are presumably SXY0..SXY2, going by the macro name. */
#define gte_stsxy3(r0, r1, r2) \
__asm__ volatile( \
"swc2 $12, 0( %0 );" \
"swc2 $13, 0( %1 );" \
"swc2 $14, 0( %2 )" \
: \
: "r"(r0), "r"(r1), "r"(r2) \
: "memory")
#define gte_stsxy3(r0, r1, r2) \
__asm__ volatile( \
"swc2 $12, 0( %0 );" \
"swc2 $13, 0( %1 );" \
"swc2 $14, 0( %2 )" \
: \
: "r"(r0), "r"(r1), "r"(r2) \
: "memory")
/* gte_avsz3(): string-asm form - two `nop` instructions followed by
 * `cop2 0x0158002D`. Takes no operands and declares no clobbers. */
#define gte_avsz3() \
__asm__ volatile( \
"nop;" \
"nop;" \
"cop2 0x0158002D;")
#define gte_avsz3() \
__asm__ volatile( \
"nop;" \
"nop;" \
"cop2 0x0158002D;")
/* asm_gte_matrix_set_rotation(r0)
*
+17 -17
View File
@@ -231,7 +231,7 @@ enum { _BitOffsets = 0
* shift_ll(rd, rt, shamt) → sll rd, rt, shamt
* jump_reg(rs) → jr rs
* jump_link(rs, rd) → jalr rs (link in rd, default $ra)
* nop() → sll $0, $0, 0
* nop → sll $0, $0, 0
*/
#define load_word(rt, base, off) enc_i(op_lw, (base), (rt), (off))
#define load_byte(rt, base, off) enc_i(op_lb, (base), (rt), (off))
@@ -261,10 +261,10 @@ enum { _BitOffsets = 0
#define jump_nreg(rs) jump_link((rs), R_RA)
/* j target — absolute jump within the current 256MB region.
 * NOTE(review): j/jal carry a 26-bit target field, but both macros go through
 * enc_i, which the load/store/add macros use as the I-type (16-bit immediate)
 * encoder with R_0 in both register slots. Confirm enc_i does not truncate
 * `off` to 16 bits, and that `off` is expected as a word index rather than a
 * byte address. */
#define jump(off) enc_i(op_j, R_0, R_0, (off))
#define jump(off) enc_i(op_j, R_0, R_0, (off))
/* jal target — absolute call within the current 256MB region. */
#define jump_nlink(off) enc_i(op_jal, R_0, R_0, (off))
#define jump_nlink(off) enc_i(op_jal, R_0, R_0, (off))
/* --- Store family (mirrors the load family) --- */
#define store_byte(rt, base, off) enc_i(op_sb, (base), (rt), (off))
@@ -297,20 +297,20 @@ enum { _BitOffsets = 0
#define div_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_divu)
/* --- Arithmetic I-type (immediate) ---
 * add_si(rt, rs, imm) → addi rt, rs, imm (encoded with op_addi) */
#define add_si(rt, rs, imm) enc_i(op_addi, (rs), (rt), (imm))
#define add_si(rt, rs, imm) enc_i(op_addi, (rs), (rt), (imm))
/* The addiu counterpart, add_ui, is defined earlier in this file. */
/* --- Set on less than (R-type and I-type) ---
 * slt_s  / slt_u  : register-register forms, selected by fc_slt / fc_sltu.
 * slt_si / slt_ui : register-immediate forms, using op_slti / op_sltiu. */
#define slt_s(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_slt)
#define slt_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_sltu)
#define slt_si(rt, rs, imm) enc_i(op_slti, (rs), (rt), (imm))
#define slt_ui(rt, rs, imm) enc_i(op_sltiu, (rs), (rt), (imm))
#define slt_si(rt, rs, imm) enc_i(op_slti, (rs), (rt), (imm))
#define slt_ui(rt, rs, imm) enc_i(op_sltiu, (rs), (rt), (imm))
/* --- Move from/to HI/LO (mult/div results) ---
 * The "from" forms name only a destination (rd); the "to" forms name only a
 * source (rs). Every unused register slot is filled with R_0. */
#define mov_from_high(rd) enc_r(op_special, R_0, R_0, (rd), 0, fc_mfhi)
#define mov_from_low(rd) enc_r(op_special, R_0, R_0, (rd), 0, fc_mflo)
#define mov_to_high(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi)
#define mov_to_low(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mtlo)
#define mov_to_high(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi)
#define mov_to_low(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mtlo)
/* --- Atomic branches (no pseudos like bgt/bge; compose with slt_* + branch_ne) ---
* branch_equal rs, rt, off → beq rs, rt, off
@@ -321,21 +321,21 @@ enum { _BitOffsets = 0
* branch_ge_zero rs, off → bgez rs, off
* (For `bgez`, the opcode is `op_bcond` with rt=1 to invert the bltz condition.) */
/* Conditional branch encoders.
 *
 * enc_i takes (op, rs, rt, imm) - the same argument order the load/store and
 * add_si encoders rely on - so the register being tested belongs in the rs
 * slot. The compare-with-zero forms used to pass R_0 as rs and the tested
 * register as rt, which encodes a test of $0 rather than of `rs`.
 *
 *   branch_equal / branch_ne      rs and rt are the two compared registers
 *   branch_lt_zero / branch_gt_zero / branch_le_zero
 *                                 rs is the tested register, rt field is 0
 *   branch_ge_zero                op_bcond with rt field = 1 selects bgez;
 *                                 the offset is masked to 16 bits as before
 *
 * `off` is passed through to the immediate field unchanged. */
#define branch_equal(rs, rt, off) enc_i(op_beq, (rs), (rt), (off))
#define branch_ne(rs, rt, off) enc_i(op_bne, (rs), (rt), (off))
#define branch_lt_zero(rs, off) enc_i(op_bltz, (rs), R_0, (off))
#define branch_gt_zero(rs, off) enc_i(op_bgtz, (rs), R_0, (off))
#define branch_le_zero(rs, off) enc_i(op_blez, (rs), R_0, (off))
#define branch_ge_zero(rs, off) enc_i(op_bcond, (rs), 1, ((off) & 0xFFFF))
/* --- System (kernel) instructions ---
 * Both take no arguments: every register slot is R_0 and the shift field is
 * 0, so only the function code (fc_syscall / fc_break) differs. */
#define syscall() enc_r(op_special, R_0, R_0, R_0, 0, fc_syscall)
#define breakpoint() enc_r(op_special, R_0, R_0, R_0, 0, fc_break)
/* --- Shift-amount alias (matches the gas convention `\p3 = shamt`) ---
 * shamt(rd, rt, n) forwards its arguments unchanged to shift_ll(rd, rt, n). */
#define shamt(rd, rt, n) shift_ll(rd, rt, n)
#define shamt(rd, rt, n) shift_ll(rd, rt, n)
/* nop — canonical sll $0, $0, 0, built as shift_ll(rdiscard, rdiscard, 0).
 *
 * Two spellings are shown: the function-like `nop()` and the object-like
 * `nop`. They are incompatible definitions of the same macro name, so only
 * one can be in effect at a time.
 * NOTE(review): the object-like form rewrites every bare `nop` identifier
 * token in code that sees this header. Text inside string literals (for
 * example "nop;" in inline asm) is not macro-expanded and is unaffected. */
#define nop() shift_ll(rdiscard, rdiscard, 0)
#define nop shift_ll(rdiscard, rdiscard, 0)
/* Single-word immediate loads: rt = $0 + imm.
 *   load_imm_1w(rt, imm)    → add_ui(rt, R_0, imm)  (addiu form)
 *   load_imm_1w_s0(rt, imm) → add_si(rt, R_0, imm)  (addi form)
 * load_imm_1w_s0 previously expanded to `add_si((rt)), R_0, (imm))`; the
 * stray parenthesis closed the add_si call after its first argument, so the
 * macro could not expand to a valid three-argument call. */
#define load_imm_1w(rt, imm) add_ui((rt), R_0, (imm))
#define load_imm_1w_s0(rt, imm) add_si((rt), R_0, (imm))
@@ -491,7 +491,7 @@ Code CodeBlob_(mips_flush_icache) {
add_ui(rret_0, rdiscard, bios_flushcache), /* addiu $a0, $0, 0x44 */
add_ui(rtmp_0, rdiscard, bios_table_addr), /* addiu $t0, $0, 0xA0 */
jump_link(rtmp_0, rret_addr), /* jalr $t0, $ra */
nop(), /* BD slot */
nop, /* BD slot */
load_word(rret_addr, rstack_ptr, 4), /* lw $ra, 4($sp) */
jump_reg(rret_addr), /* jr $ra */
add_ui(rstack_ptr, rstack_ptr, 8) /* sp += 8 (BD) */
@@ -511,7 +511,7 @@ FI_ void mips_flush_icache(void) { C_(VoidFn*, codeblob_mips_flush_icache)(); }
, add_ui(rret_0, rdiscard, bios_flushcache) \
, add_ui(rtmp_0, rdiscard, bios_table_addr) \
, jump_link(rtmp_0, rret_addr) \
, nop() \
, nop \
, load_word(rret_addr, rstack_ptr, 4) \
, jump_reg(rret_addr) \
, add_ui(rstack_ptr, rstack_ptr, 8) \
Submodule toolchain/psyq_iwyu added at 5cbf9f68d1