diff --git a/code/duffle/gte.h b/code/duffle/gte.h
index 09bc855..09fb8de 100644
--- a/code/duffle/gte.h
+++ b/code/duffle/gte.h
@@ -246,16 +246,16 @@ enum {
 #define gte_cr_OFY_Code  31
 
 enum {
-    gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code,
-    gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code,
-    gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code,
-    gte_cr_TRX  = gte_cr_TRX_Code,  gte_cr_TRY  = gte_cr_TRY_Code,  gte_cr_TRZ  = gte_cr_TRZ_Code,
-    gte_cr_L11  = gte_cr_L11_Code,  gte_cr_L12  = gte_cr_L12_Code,  gte_cr_L13  = gte_cr_L13_Code,
-    gte_cr_L21  = gte_cr_L21_Code,  gte_cr_L22  = gte_cr_L22_Code,  gte_cr_L23  = gte_cr_L23_Code,
-    gte_cr_LR1  = gte_cr_LR1_Code,  gte_cr_LR2  = gte_cr_LR2_Code,  gte_cr_LR3  = gte_cr_LR3_Code,
-    gte_cr_RBK  = gte_cr_RBK_Code,  gte_cr_GBK  = gte_cr_GBK_Code,  gte_cr_BBK  = gte_cr_BBK_Code,
-    gte_cr_RFC  = gte_cr_RFC_Code,  gte_cr_GFC  = gte_cr_GFC_Code,  gte_cr_BFC  = gte_cr_BFC_Code,
-    gte_cr_OFX  = gte_cr_OFX_Code,  gte_cr_OFY  = gte_cr_OFY_Code,
+	gte_cr_RT11 = gte_cr_RT11_Code, gte_cr_RT12 = gte_cr_RT12_Code, gte_cr_RT13 = gte_cr_RT13_Code,
+	gte_cr_RT21 = gte_cr_RT21_Code, gte_cr_RT22 = gte_cr_RT22_Code, gte_cr_RT23 = gte_cr_RT23_Code,
+	gte_cr_RT31 = gte_cr_RT31_Code, gte_cr_RT32 = gte_cr_RT32_Code, gte_cr_RT33 = gte_cr_RT33_Code,
+	gte_cr_TRX  = gte_cr_TRX_Code,  gte_cr_TRY  = gte_cr_TRY_Code,  gte_cr_TRZ  = gte_cr_TRZ_Code,
+	gte_cr_L11  = gte_cr_L11_Code,  gte_cr_L12  = gte_cr_L12_Code,  gte_cr_L13  = gte_cr_L13_Code,
+	gte_cr_L21  = gte_cr_L21_Code,  gte_cr_L22  = gte_cr_L22_Code,  gte_cr_L23  = gte_cr_L23_Code,
+	gte_cr_LR1  = gte_cr_LR1_Code,  gte_cr_LR2  = gte_cr_LR2_Code,  gte_cr_LR3  = gte_cr_LR3_Code,
+	gte_cr_RBK  = gte_cr_RBK_Code,  gte_cr_GBK  = gte_cr_GBK_Code,  gte_cr_BBK  = gte_cr_BBK_Code,
+	gte_cr_RFC  = gte_cr_RFC_Code,  gte_cr_GFC  = gte_cr_GFC_Code,  gte_cr_BFC  = gte_cr_BFC_Code,
+	gte_cr_OFX  = gte_cr_OFX_Code,  gte_cr_OFY  = gte_cr_OFY_Code,
 };
 
 enum { _C2_OPS_ = 0
@@ -440,25 +440,25 @@ enum { _C2_OPS_ = 0
  * The `asm_clobber(...)` helper from gcc_asm.h prepends the colon that
  * starts the clobbers section. */
 #define gte_load_v0(r_ptr, base) \
-    asm volatile(                                              \
-        asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) )    \
-        , "r"(r_ptr)                                           \
-        asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
-    )
+	asm volatile(                                            \
+		asm_inline( gte_lwc2_v0(base), gte_lwc2_v0z(base) )    \
+		, "r"(r_ptr)                                           \
+		asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
+	)
 
 #define gte_load_v1(r_ptr, base) \
-    asm volatile(                                              \
-        asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) )    \
-        , "r"(r_ptr)                                           \
-        asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
-    )
+	asm volatile(                                            \
+		asm_inline( gte_lwc2_v1(base), gte_lwc2_v1z(base) )    \
+		, "r"(r_ptr)                                           \
+		asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
+	)
 
 #define gte_load_v2(r_ptr, base) \
-    asm volatile(                                              \
-        asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) )    \
-        , "r"(r_ptr)                                           \
-        asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
-    )
+	asm volatile(                                            \
+		asm_inline( gte_lwc2_v2(base), gte_lwc2_v2z(base) )    \
+		, "r"(r_ptr)                                           \
+		asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
+	)
 
 /* gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) — the canonical prelude to gte_cmd_rtpt.
  *
@@ -473,13 +473,13 @@ enum { _C2_OPS_ = 0
  *   gte_rtpt();
  */
 #define gte_load_v0v1v2(p0, p1, p2, b0, b1, b2) \
-    asm volatile(                                              \
-        asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0),         \
-                    gte_lwc2_v1(b1), gte_lwc2_v1z(b1),         \
-                    gte_lwc2_v2(b2), gte_lwc2_v2z(b2) )        \
-        , "r"(p0), "r"(p1), "r"(p2)                            \
-        asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
-    )
+	asm volatile(                                            \
+		asm_inline( gte_lwc2_v0(b0), gte_lwc2_v0z(b0),         \
+								gte_lwc2_v1(b1), gte_lwc2_v1z(b1),         \
+								gte_lwc2_v2(b2), gte_lwc2_v2z(b2) )        \
+		, "r"(p0), "r"(p1), "r"(p2)                            \
+		asm_clobber( reg_str(R_V0_Code), reg_str(R_T0_Code), reg_str(R_T1_Code), reg_str(R_RA_Code), "memory" )  \
+	)
 
 /**
  * @brief Rotate, Translate and Perspective Triple (23 cycles)
@@ -507,40 +507,69 @@ enum { _C2_OPS_ = 0
  * they need to survive across the call (RTPT writes SXY0..2, SZ0..3,
  * OTZ, MAC0..3, IR0..3, etc.).
  */
-#define gte_rtpt()                                                          \
-    asm volatile(                                                          \
-        asm_inline( nop(), nop(), gte_cmdw_rtpt )                          \
-        asm_clobber( clb_system )                                          \
-    )
+#define gte_rtpt()                        \
+	asm volatile(                           \
+		asm_inline( nop, nop, gte_cmdw_rtpt ) \
+		asm_clobber( clb_system )             \
+	)
 
-#define gte_rtpt_ori()    \
-    __asm__ volatile( \
-        "nop;"        \
-        "nop;"        \
-        "cop2 0x0280030;")
+#define gte_rtpt_ori() \
+	__asm__ volatile( \
+		"nop;"          \
+		"nop;"          \
+		"cop2 0x0280030;")
 
-#define gte_nclip()   \
-    __asm__ volatile( \
-        "nop;"        \
-        "nop;"        \
-        "cop2 0x01400006;")
+/**
+ * @brief Normal clipping (8 cycles)
+ *
+ * @details Computes the signed-area term of the triangle formed by the three
+ * screen coordinates (C2_SXY0-2), used for backface culling. If the value of
+ * C2_MAC0 is negative, the winding is inverted and the triangle is back facing.
+ *
+ * The following equation is performed when executing this GTE command:
+ *
+ *     MAC0 = SX0*SY1 + SX1*SY2 + SX2*SY0 - SX0*SY2 - SX1*SY0 - SX2*SY1
+ *
+ * Encoder-style emission (no inline-asm strings in the code body):
+ *   1. Two `nop` words fill the COP2 pipeline latency - the GTE
+ *      pipeline takes a few cycles per op, and the nops let any
+ *      preceding lwc2/swc2/RTPT retire before NCLIP starts reading
+ *      its inputs from SXY0/SXY1/SXY2.
+ *   2. The NCLIP command word itself is `gte_cmdw_nclip` (see the
+ *      pre-baked encoders above) - `0x01400006` decoded as
+ *      `op_cop2` | CO(1) | cmd=NCLIP, with all SF/MX/V/CV/LM fields
+ *      zero. NCLIP is spec-clean in the original PsyQ source
+ *      (unlike RTPS/RTPT which carry the `gte_cmdw_psyq_compat`
+ *      quirk), so `gte_cmdw_nclip` does NOT OR in any reserved bits.
+ *
+ * Clobbers the caller-saved GPRs via `clb_system` (per the kernel
+ * ABI) plus the standard "memory" barrier. Does not clobber any COP2
+ * data/control register - those have to be saved by the caller if
+ * they need to survive across the call (NCLIP writes MAC0, plus FLAG on
+ * overflow; it is purely a cross-product sign computation on SXY0..2).
+ */
+#define gte_nclip()                        \
+	asm volatile(                            \
+		asm_inline( nop, nop, gte_cmdw_nclip ) \
+		asm_clobber( clb_system )              \
+	)
 
 #define gte_stotz(r0) __asm__ volatile("swc2   $7, 0( %0 )" : : "r"(r0) : "memory")
 
-#define gte_stsxy3(r0, r1, r2)      \
-    __asm__ volatile(               \
-        "swc2   $12, 0( %0 );"      \
-        "swc2   $13, 0( %1 );"      \
-        "swc2   $14, 0( %2 )"       \
-        :                           \
-        : "r"(r0), "r"(r1), "r"(r2) \
-        : "memory")
+#define gte_stsxy3(r0, r1, r2)  \
+	__asm__ volatile(             \
+		"swc2   $12, 0( %0 );"      \
+		"swc2   $13, 0( %1 );"      \
+		"swc2   $14, 0( %2 )"       \
+		:                           \
+		: "r"(r0), "r"(r1), "r"(r2) \
+		: "memory")
 
-#define gte_avsz3()   \
-    __asm__ volatile( \
-        "nop;"        \
-        "nop;"        \
-        "cop2 0x0158002D;")
+#define gte_avsz3() \
+	__asm__ volatile( \
+		"nop;"          \
+		"nop;"          \
+		"cop2 0x0158002D;")
 
 /* asm_gte_matrix_set_rotation(r0)
  *
diff --git a/code/duffle/mips.h b/code/duffle/mips.h
index 72f662e..8676148 100644
--- a/code/duffle/mips.h
+++ b/code/duffle/mips.h
@@ -231,7 +231,7 @@ enum { _BitOffsets = 0
  *   shift_ll(rd, rt, shamt)    → sll   rd, rt, shamt
  *   jump_reg(rs)               → jr    rs
  *   jump_link(rs, rd)          → jalr  rs        (link in rd, default $ra)
- *   nop()                      → sll   $0, $0, 0
+ *   nop                        → sll   $0, $0, 0
  */
 #define load_word(rt, base, off)   enc_i(op_lw,    (base), (rt), (off))
 #define load_byte(rt, base, off)   enc_i(op_lb,    (base), (rt), (off))
@@ -261,10 +261,10 @@ enum { _BitOffsets = 0
 #define jump_nreg(rs)              jump_link((rs), R_RA)
 
 /* j target — absolute jump within the current 256MB region. */
-#define jump(off)                   enc_i(op_j,     R_0, R_0, (off))
+#define jump(off)                  enc_i(op_j,     R_0, R_0, (off))
 
 /* jal target — absolute call within the current 256MB region. */
-#define jump_nlink(off)             enc_i(op_jal,   R_0, R_0, (off))
+#define jump_nlink(off)            enc_i(op_jal,   R_0, R_0, (off))
 
 /* --- Store family (mirrors the load family) --- */
 #define store_byte(rt, base, off)  enc_i(op_sb,    (base), (rt), (off))
@@ -297,20 +297,20 @@ enum { _BitOffsets = 0
 #define div_u(rd, rs, rt)          enc_r(op_special, (rs), (rt), (rd), 0, fc_divu)
 
 /* --- Arithmetic I-type (immediate) --- */
-#define add_si(rt, rs, imm)         enc_i(op_addi,  (rs), (rt), (imm))
+#define add_si(rt, rs, imm)        enc_i(op_addi,  (rs), (rt), (imm))
 /* add_ui already exists above as add_ui */
 
 /* --- Set on less than (R-type and I-type) --- */
 #define slt_s(rd, rs, rt)          enc_r(op_special, (rs), (rt), (rd), 0, fc_slt)
 #define slt_u(rd, rs, rt)          enc_r(op_special, (rs), (rt), (rd), 0, fc_sltu)
-#define slt_si(rt, rs, imm)         enc_i(op_slti,  (rs), (rt), (imm))
-#define slt_ui(rt, rs, imm)         enc_i(op_sltiu, (rs), (rt), (imm))
+#define slt_si(rt, rs, imm)        enc_i(op_slti,  (rs), (rt), (imm))
+#define slt_ui(rt, rs, imm)        enc_i(op_sltiu, (rs), (rt), (imm))
 
 /* --- Move from/to HI/LO (mult/div results) --- */
 #define mov_from_high(rd)          enc_r(op_special, R_0, R_0, (rd), 0, fc_mfhi)
 #define mov_from_low(rd)           enc_r(op_special, R_0, R_0, (rd), 0, fc_mflo)
-#define mov_to_high(rs)             enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi)
-#define mov_to_low(rs)              enc_r(op_special, (rs), R_0, R_0, 0, fc_mtlo)
+#define mov_to_high(rs)            enc_r(op_special, (rs), R_0, R_0, 0, fc_mthi)
+#define mov_to_low(rs)             enc_r(op_special, (rs), R_0, R_0, 0, fc_mtlo)
 
 /* --- Atomic branches (no pseudos like bgt/bge; compose with slt_* + branch_ne) ---
  * branch_equal  rs, rt, off → beq   rs, rt, off
@@ -321,21 +321,21 @@ enum { _BitOffsets = 0
  * branch_ge_zero rs, off     → bgez  rs, off
  * (For `bgez`, the opcode is `op_bcond` with rt=1 to invert the bltz condition.) */
 #define branch_equal(rs, rt, off)   enc_i(op_beq,   (rs), (rt), (off))
-#define branch_ne(rs, rt, off)       enc_i(op_bne,   (rs), (rt), (off))
-#define branch_lt_zero(rs, off)      enc_i(op_bltz,  R_0, (rs), (off))
-#define branch_gt_zero(rs, off)      enc_i(op_bgtz,  R_0, (rs), (off))
-#define branch_le_zero(rs, off)      enc_i(op_blez,  R_0, (rs), (off))
-#define branch_ge_zero(rs, off)      enc_i(op_bcond, R_0, (rs), (1u << 16) | ((off) & 0xFFFF))
+#define branch_ne(rs, rt, off)      enc_i(op_bne,   (rs), (rt), (off))
+#define branch_lt_zero(rs, off)     enc_i(op_bltz,  (rs), R_0, (off)) /* tested reg in rs field, rt field = 0 */
+#define branch_gt_zero(rs, off)     enc_i(op_bgtz,  (rs), R_0, (off)) /* tested reg in rs field, rt field = 0 */
+#define branch_le_zero(rs, off)     enc_i(op_blez,  (rs), R_0, (off)) /* tested reg in rs field, rt field = 0 */
+#define branch_ge_zero(rs, off)     enc_i(op_bcond, (rs), 1,   (off)) /* rt field = 1 selects bgez (see note above) */
 
 /* --- System (kernel) instructions --- */
 #define syscall()                   enc_r(op_special, R_0, R_0, R_0, 0, fc_syscall)
 #define breakpoint()                enc_r(op_special, R_0, R_0, R_0, 0, fc_break)
 
 /* --- Shift-amount alias (matches the gas convention `\p3 = shamt`) --- */
-#define shamt(rd, rt, n)           shift_ll(rd, rt, n)
+#define shamt(rd, rt, n)            shift_ll(rd, rt, n)
 
 /* nop — canonical sll $0, $0, 0 */
-#define nop() shift_ll(rdiscard, rdiscard, 0)
+#define nop shift_ll(rdiscard, rdiscard, 0)
 
 #define load_imm_1w(rt, imm)    add_ui((rt),  R_0, (imm))
 #define load_imm_1w_s0(rt, imm) add_si((rt)), R_0, (imm))
@@ -491,7 +491,7 @@ Code CodeBlob_(mips_flush_icache) {
 	add_ui(rret_0, rdiscard, bios_flushcache), /* addiu $a0, $0, 0x44 */
 	add_ui(rtmp_0, rdiscard, bios_table_addr), /* addiu $t0, $0, 0xA0 */
 	jump_link(rtmp_0, rret_addr),              /* jalr $t0, $ra   */
-	nop(),                                     /* BD slot         */
+	nop,                                       /* BD slot         */
 	load_word(rret_addr, rstack_ptr, 4),       /* lw  $ra, 4($sp) */
 	jump_reg(rret_addr),                       /* jr   $ra        */
 	add_ui(rstack_ptr, rstack_ptr, 8)          /* sp += 8 (BD)    */
@@ -511,7 +511,7 @@ FI_ void mips_flush_icache(void) { C_(VoidFn*, codeblob_mips_flush_icache)(); }
 	, add_ui(rret_0, rdiscard, bios_flushcache) \
 	, add_ui(rtmp_0, rdiscard, bios_table_addr) \
 	, jump_link(rtmp_0, rret_addr)              \
-	, nop()                                     \
+	, nop                                       \
 	, load_word(rret_addr, rstack_ptr, 4)       \
 	, jump_reg(rret_addr)                       \
 	, add_ui(rstack_ptr, rstack_ptr, 8)         \
diff --git a/toolchain/psyq_iwyu b/toolchain/psyq_iwyu
new file mode 160000
index 0000000..5cbf9f6
--- /dev/null
+++ b/toolchain/psyq_iwyu
@@ -0,0 +1 @@
+Subproject commit 5cbf9f68d10490949b43b52846dae8a6383d5c55