diff --git a/code/duffle/gte.h b/code/duffle/gte.h index 09bc855..09fb8de 100644 --- a/code/duffle/gte.h +++ b/code/duffle/gte.h @@ -246,16 +246,16 @@ enum { #define gte_cr_OFY_Code 31 enum { - gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code, - gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code, - gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code, - gte_cr_TRX = gte_cr_TRX_Code, gte_cr_TRY = gte_cr_TRY_Code, gte_cr_TRZ = gte_cr_TRZ_Code, - gte_cr_L11 = gte_cr_L11_Code, gte_cr_L12 = gte_cr_L12_Code, gte_cr_L13 = gte_cr_L13_Code, - gte_cr_L21 = gte_cr_L21_Code, gte_cr_L22 = gte_cr_L22_Code, gte_cr_L23 = gte_cr_L23_Code, - gte_cr_LR1 = gte_cr_LR1_Code, gte_cr_LR2 = gte_cr_LR2_Code, gte_cr_LR3 = gte_cr_LR3_Code, - gte_cr_RBK = gte_cr_RBK_Code, gte_cr_GBK = gte_cr_GBK_Code, gte_cr_BBK = gte_cr_BBK_Code, - gte_cr_RFC = gte_cr_RFC_Code, gte_cr_GFC = gte_cr_GFC_Code, gte_cr_BFC = gte_cr_BFC_Code, - gte_cr_OFX = gte_cr_OFX_Code, gte_cr_OFY = gte_cr_OFY_Code, + gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code, + gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code, + gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code, + gte_cr_TRX = gte_cr_TRX_Code, gte_cr_TRY = gte_cr_TRY_Code, gte_cr_TRZ = gte_cr_TRZ_Code, + gte_cr_L11 = gte_cr_L11_Code, gte_cr_L12 = gte_cr_L12_Code, gte_cr_L13 = gte_cr_L13_Code, + gte_cr_L21 = gte_cr_L21_Code, gte_cr_L22 = gte_cr_L22_Code, gte_cr_L23 = gte_cr_L23_Code, + gte_cr_LR1 = gte_cr_LR1_Code, gte_cr_LR2 = gte_cr_LR2_Code, gte_cr_LR3 = gte_cr_LR3_Code, + gte_cr_RBK = gte_cr_RBK_Code, gte_cr_GBK = gte_cr_GBK_Code, gte_cr_BBK = gte_cr_BBK_Code, + gte_cr_RFC = gte_cr_RFC_Code, gte_cr_GFC = gte_cr_GFC_Code, gte_cr_BFC = gte_cr_BFC_Code, + gte_cr_OFX = gte_cr_OFX_Code, gte_cr_OFY = 
gte_cr_OFY_Code, }; enum { _C2_OPS_ = 0 @@ -440,25 +440,25 @@ enum { _C2_OPS_ = 0 * The `asm_clobber(...)` helper from gcc_asm.h prepends the colon that * starts the clobbers section. */ #define gte_load_v0(r_ptr, base) \ - asm volatile( \ - asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) ) \ - , "r"(r_ptr) \ - asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ - ) + asm volatile( \ + asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) ) \ + , "r"(r_ptr) \ + asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ + ) #define gte_load_v1(r_ptr, base) \ - asm volatile( \ - asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) ) \ - , "r"(r_ptr) \ - asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ - ) + asm volatile( \ + asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) ) \ + , "r"(r_ptr) \ + asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ + ) #define gte_load_v2(r_ptr, base) \ - asm volatile( \ - asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) ) \ - , "r"(r_ptr) \ - asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ - ) + asm volatile( \ + asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) ) \ + , "r"(r_ptr) \ + asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ + ) /* gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) — the canonical prelude to gte_cmd_rtpt. 
* @@ -473,13 +473,13 @@ enum { _C2_OPS_ = 0 * gte_rtpt(); */ #define gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) \ - asm volatile( \ - asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0), \ - gte_lwc2_v1(b1), gte_lwc2_v1z(b1), \ - gte_lwc2_v2(b2), gte_lwc2_v2z(b2) ) \ - , "r"(p0), "r"(p1), "r"(p2) \ - asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ - ) + asm volatile( \ + asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0), \ + gte_lwc2_v1(b1), gte_lwc2_v1z(b1), \ + gte_lwc2_v2(b2), gte_lwc2_v2z(b2) ) \ + , "r"(p0), "r"(p1), "r"(p2) \ + asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" ) \ + ) /** * @brief Rotate, Translate and Perspective Triple (23 cycles) @@ -507,40 +507,69 @@ enum { _C2_OPS_ = 0 * they need to survive across the call (RTPT writes SXY0..2, SZ0..3, * OTZ, MAC0..3, IR0..3, etc.). */ -#define gte_rtpt() \ - asm volatile( \ - asm_inline( nop(), nop(), gte_cmdw_rtpt ) \ - asm_clobber( clb_system ) \ - ) +#define gte_rtpt() \ + asm volatile( \ + asm_inline( nop, nop, gte_cmdw_rtpt ) \ + asm_clobber( clb_system ) \ + ) -#define gte_rtpt_ori() \ - __asm__ volatile( \ - "nop;" \ - "nop;" \ - "cop2 0x0280030;") +#define gte_rtpt_ori() \ + __asm__ volatile( \ + "nop;" \ + "nop;" \ + "cop2 0x0280030;") -#define gte_nclip() \ - __asm__ volatile( \ - "nop;" \ - "nop;" \ - "cop2 0x01400006;") +/** + * @brief Normal clipping (8 cycles) + * + * @details Computes the sign of three screen coordinates (C2_SXY0-2) used for + * backface culling. If the value of C2_MAC0 is negative, the coordinates are + * inverted and thus the triangle is back facing. + * + * The following equation is performed when executing this GTE command: + * + * MAC0 = SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1 + * + * Encoder-style emission (no inline-asm strings in the code body): + * 1. 
Two `nop` words fill the COP2 pipeline latency - the GTE + * pipeline takes a few cycles per op, and the nops let any + * preceding lwc2/swc2/RTPT retire before NCLIP starts reading + * its inputs from SXY0/SXY1/SXY2. + * 2. The NCLIP command word itself is `gte_cmdw_nclip` (see the + * pre-baked encoders above) - `0x01400006` decoded as + * `op_cop2` | CO(1) | cmd=NCLIP, with all SF/MX/V/CV/LM fields + * zero. NCLIP is spec-clean in the original PsyQ source + * (unlike RTPS/RTPT which carry the `gte_cmdw_psyq_compat` + * quirk), so `gte_cmdw_nclip` does NOT OR in any reserved bits. + * + * Clobbers the caller-saved GPRs via `clb_system` (per the kernel + * ABI) plus the standard "memory" barrier. Does not clobber any COP2 + * data/control register - those have to be saved by the caller if + * they need to survive across the call (NCLIP writes MAC0 only; it + * is purely a sign-of-double-product computation on SXY0..2). + */ +#define gte_nclip() \ + asm volatile( \ + asm_inline( nop, nop, gte_cmdw_nclip ) \ + asm_clobber( clb_system ) \ + ) #define gte_stotz(r0) __asm__ volatile("swc2 $7, 0( %0 )" : : "r"(r0) : "memory") -#define gte_stsxy3(r0, r1, r2) \ - __asm__ volatile( \ - "swc2 $12, 0( %0 );" \ - "swc2 $13, 0( %1 );" \ - "swc2 $14, 0( %2 )" \ - : \ - : "r"(r0), "r"(r1), "r"(r2) \ - : "memory") +#define gte_stsxy3(r0, r1, r2) \ + __asm__ volatile( \ + "swc2 $12, 0( %0 );" \ + "swc2 $13, 0( %1 );" \ + "swc2 $14, 0( %2 )" \ + : \ + : "r"(r0), "r"(r1), "r"(r2) \ + : "memory") -#define gte_avsz3() \ - __asm__ volatile( \ - "nop;" \ - "nop;" \ - "cop2 0x0158002D;") +#define gte_avsz3() \ + __asm__ volatile( \ + "nop;" \ + "nop;" \ + "cop2 0x0158002D;") /* asm_gte_matrix_set_rotation(r0) * diff --git a/code/duffle/mips.h b/code/duffle/mips.h index 72f662e..8676148 100644 --- a/code/duffle/mips.h +++ b/code/duffle/mips.h @@ -231,7 +231,7 @@ enum { _BitOffsets = 0 * shift_ll(rd, rt, shamt) → sll rd, rt, shamt * jump_reg(rs) → jr rs * jump_link(rs, rd) → jalr rs (link 
in rd, default $ra) - * nop() → sll $0, $0, 0 + * nop → sll $0, $0, 0 */ #define load_word(rt, base, off) enc_i(op_lw, (base), (rt), (off)) #define load_byte(rt, base, off) enc_i(op_lb, (base), (rt), (off)) @@ -261,10 +261,10 @@ enum { _BitOffsets = 0 #define jump_nreg(rs) jump_link((rs), R_RA) /* j target — absolute jump within the current 256MB region. */ -#define jump(off) enc_i(op_j, R_0, R_0, (off)) +#define jump(off) enc_i(op_j, R_0, R_0, (off)) /* jal target — absolute call within the current 256MB region. */ -#define jump_nlink(off) enc_i(op_jal, R_0, R_0, (off)) +#define jump_nlink(off) enc_i(op_jal, R_0, R_0, (off)) /* --- Store family (mirrors the load family) --- */ #define store_byte(rt, base, off) enc_i(op_sb, (base), (rt), (off)) @@ -297,20 +297,20 @@ enum { _BitOffsets = 0 #define div_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_divu) /* --- Arithmetic I-type (immediate) --- */ -#define add_si(rt, rs, imm) enc_i(op_addi, (rs), (rt), (imm)) +#define add_si(rt, rs, imm) enc_i(op_addi, (rs), (rt), (imm)) /* add_ui already exists above as add_ui */ /* --- Set on less than (R-type and I-type) --- */ #define slt_s(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_slt) #define slt_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_sltu) -#define slt_si(rt, rs, imm) enc_i(op_slti, (rs), (rt), (imm)) -#define slt_ui(rt, rs, imm) enc_i(op_sltiu, (rs), (rt), (imm)) +#define slt_si(rt, rs, imm) enc_i(op_slti, (rs), (rt), (imm)) +#define slt_ui(rt, rs, imm) enc_i(op_sltiu, (rs), (rt), (imm)) /* --- Move from/to HI/LO (mult/div results) --- */ #define mov_from_high(rd) enc_r(op_special, R_0, R_0, (rd), 0, fc_mfhi) #define mov_from_low(rd) enc_r(op_special, R_0, R_0, (rd), 0, fc_mflo) -#define mov_to_high(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi) -#define mov_to_low(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mtlo) +#define mov_to_high(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi) +#define mov_to_low(rs) enc_r(op_special, (rs), R_0, 
R_0, 0, fc_mtlo) /* --- Atomic branches (no pseudos like bgt/bge; compose with slt_* + branch_ne) --- * branch_equal rs, rt, off → beq rs, rt, off @@ -321,21 +321,21 @@ enum { _BitOffsets = 0 * branch_ge_zero rs, off → bgez rs, off * (For `bgez`, the opcode is `op_bcond` with rt=1 to invert the bltz condition.) */ #define branch_equal(rs, rt, off) enc_i(op_beq, (rs), (rt), (off)) -#define branch_ne(rs, rt, off) enc_i(op_bne, (rs), (rt), (off)) -#define branch_lt_zero(rs, off) enc_i(op_bltz, R_0, (rs), (off)) -#define branch_gt_zero(rs, off) enc_i(op_bgtz, R_0, (rs), (off)) -#define branch_le_zero(rs, off) enc_i(op_blez, R_0, (rs), (off)) -#define branch_ge_zero(rs, off) enc_i(op_bcond, R_0, (rs), (1u << 16) | ((off) & 0xFFFF)) +#define branch_ne(rs, rt, off) enc_i(op_bne, (rs), (rt), (off)) +#define branch_lt_zero(rs, off) enc_i(op_bltz, R_0, (rs), (off)) +#define branch_gt_zero(rs, off) enc_i(op_bgtz, R_0, (rs), (off)) +#define branch_le_zero(rs, off) enc_i(op_blez, R_0, (rs), (off)) +#define branch_ge_zero(rs, off) enc_i(op_bcond, R_0, (rs), (1u << 16) | ((off) & 0xFFFF)) /* --- System (kernel) instructions --- */ #define syscall() enc_r(op_special, R_0, R_0, R_0, 0, fc_syscall) #define breakpoint() enc_r(op_special, R_0, R_0, R_0, 0, fc_break) /* --- Shift-amount alias (matches the gas convention `\p3 = shamt`) --- */ -#define shamt(rd, rt, n) shift_ll(rd, rt, n) +#define shamt(rd, rt, n) shift_ll(rd, rt, n) /* nop — canonical sll $0, $0, 0 */ -#define nop() shift_ll(rdiscard, rdiscard, 0) +#define nop shift_ll(rdiscard, rdiscard, 0) #define load_imm_1w(rt, imm) add_ui((rt), R_0, (imm)) -#define load_imm_1w_s0(rt, imm) add_si((rt)), R_0, (imm)) +#define load_imm_1w_s0(rt, imm) add_si((rt), R_0, (imm)) @@ -491,7 +491,7 @@ Code CodeBlob_(mips_flush_icache) { add_ui(rret_0, rdiscard, bios_flushcache), /* addiu $a0, $0, 0x44 */ add_ui(rtmp_0, rdiscard, bios_table_addr), /* addiu $t0, $0, 0xA0 */ jump_link(rtmp_0, rret_addr), /* jalr $t0, $ra */ - nop(), /* BD slot */ + 
nop, /* BD slot */ load_word(rret_addr, rstack_ptr, 4), 
/* lw $ra, 4($sp) */ jump_reg(rret_addr), /* jr $ra */ add_ui(rstack_ptr, rstack_ptr, 8) /* sp += 8 (BD) */ @@ -511,7 +511,7 @@ FI_ void mips_flush_icache(void) { C_(VoidFn*, codeblob_mips_flush_icache)(); } , add_ui(rret_0, rdiscard, bios_flushcache) \ , add_ui(rtmp_0, rdiscard, bios_table_addr) \ , jump_link(rtmp_0, rret_addr) \ - , nop() \ + , nop \ , load_word(rret_addr, rstack_ptr, 4) \ , jump_reg(rret_addr) \ , add_ui(rstack_ptr, rstack_ptr, 8) \ diff --git a/toolchain/psyq_iwyu b/toolchain/psyq_iwyu new file mode 160000 index 0000000..5cbf9f6 --- /dev/null +++ b/toolchain/psyq_iwyu @@ -0,0 +1 @@ +Subproject commit 5cbf9f68d10490949b43b52846dae8a6383d5c55