diff --git a/code/duffle/gcc_asm.h b/code/duffle/gcc_asm.h index bb0beb8..e533c5c 100644 --- a/code/duffle/gcc_asm.h +++ b/code/duffle/gcc_asm.h @@ -254,9 +254,63 @@ #define _INL_98(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36,p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49,p50,p51,p52,p53,p54,p55,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65,p66,p67,p68,p69,p70,p71,p72,p73,p74,p75,p76,p77,p78,p79,p80,p81,p82,p83,p84,p85,p86,p87,p88,p89,p90,p91,p92,p93,p94,p95,p96,p97) ".word " _STR98 : : _OP90,"i"(p90),"i"(p91),"i"(p92),"i"(p93),"i"(p94),"i"(p95),"i"(p96),"i"(p97) #define _INL_99(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36,p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49,p50,p51,p52,p53,p54,p55,p56,p57,p58,p59,p60,p61,p62,p63,p64,p65,p66,p67,p68,p69,p70,p71,p72,p73,p74,p75,p76,p77,p78,p79,p80,p81,p82,p83,p84,p85,p86,p87,p88,p89,p90,p91,p92,p93,p94,p95,p96,p97,p98) ".word " _STR99 : : _OP90,"i"(p90),"i"(p91),"i"(p92),"i"(p93),"i"(p94),"i"(p95),"i"(p96),"i"(p97),"i"(p98) -/* The AST Builders */ -#define asm_clobber(...) : __VA_ARGS__ -#define asm_inline(...) m_expand(glue(_INL_, _ASM_COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)) +/* ============================================================================ + * AST BUILDERS — assemble a complete inline-asm block + * ============================================================================ + * + * A complete GCC inline-asm statement has up to 4 sections separated by `:`: + * + * asm volatile ( "code" : OUTPUTS : INPUTS : CLOBBERS ); + * + * Every section-builder below prepends the `:` separator that GCC requires, + * so you can compose them inline without thinking about punctuation. The + * master `asm_block(...)` then wraps the four sections in `asm volatile (...)`. + * + * asm_block( + * asm_code( "..." 
), + * asm_out ( "=r"(x), "+m"(y) ), // optional + * asm_in ( "r"(a), "m"(b) ), // optional + * asm_clb ( "$8", "memory" ) // optional + * ); + * + * Common idioms (kept for back-compat / terseness): + * + * asm_blob(asm_inline(...), asm_clobber(...)) // 2-section, no I/O + * asm_block(asm_inline(...), , , ) // 4-section, empty + */ -/* The Shell */ +/* `asm_code` is a passthrough — it does NOT prepend a colon, since the code + * section is always the first (no separator needed before it). Output of + * `asm_inline(...)` passes through unchanged; CAUTION: it already ends in + * `: : "i"(...)`, so only a clobber section may follow it (no `asm_out`/`asm_in`). */ +#define asm_code(...) __VA_ARGS__ + +/* `asm_out` prepends `:` — separates code/outputs/inputs/clobbers */ +#define asm_out(...) : __VA_ARGS__ +/* `asm_in` prepends `:` */ +#define asm_in(...) : __VA_ARGS__ +/* `asm_clb` prepends `:` */ +#define asm_clb(...) : __VA_ARGS__ + +/* `asm_clobber` is the legacy single-section name; it expands exactly like + * `asm_clb` (one leading `:`). Kept for existing call-sites, including ones + * that hand-write later sections inside it (e.g. `"r"(x) : "memory"`). */ +#define asm_clobber(...) : __VA_ARGS__ + +/* `asm_inline(...)` dispatches into `_INL_` to emit up to 99 encoded + * instruction words. This is the "compiled-instruction" form of `asm_code`. */ +#define asm_inline(...) m_expand(glue(_INL_, _ASM_COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)) + +/* `asm_blob(inlines, clobbers)` — the original 2-section shell. Emits + * `asm volatile ( inlines clobbers )` + * which is the `.word`-only shape (no runtime inputs/outputs): the inlines + * expand to `".word %c0, ..." : : "i"(...)`, i.e. they already supply the + * empty output section and the `"i"` input section, so `clobbers` (with its + * own leading `:`) is simply appended as the third section. */ #define asm_blob(inlines, clobbers) asm volatile ( inlines clobbers ) + +/* `asm_block(code, outs, ins, clb)` — the full 4-section shell. 
Each + * argument is expected to already include its own leading `:` (via the + * `asm_out` / `asm_in` / `asm_clb` builders) or be empty. The `code` + * argument should NOT have a leading `:`. */ +#define asm_block(code, outs, ins, clb) asm volatile ( code outs ins clb ) diff --git a/code/duffle/gte.h b/code/duffle/gte.h index 953073b..8b566bf 100644 --- a/code/duffle/gte.h +++ b/code/duffle/gte.h @@ -119,9 +119,22 @@ enum { gte_cr_OFX = 30, gte_cr_OFY = 31, }; -/* COP2 (GTE) Transfer Format - * Opcode is always op_cop2. The 'sub' field determines direction (MT/MF). */ -#define enc_cop2_tx(sub, rt, rd) enc_op(op_cop2) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd) +/* COP2 (GTE) Transfer Format: mfc2/mtc2 rt, rd (data regs) or cfc2/ctc2 rt, rd (control regs) + * Layout: [op_cop2:6][sub:5][rt:5][rd:5][0:11] + * - sub: MIPS COPz rs field: 0x00 mfc2, 0x02 cfc2, 0x04 mtc2, 0x06 ctc2. NOTE(review): cop_mf (0x00) / cop_mt (0x04) therefore select the DATA-register moves, not cfc2/ctc2 + * - rt: GPR source/dest + * - rd: COP2 register index (0..31); data vs control bank is selected by `sub` */ +#define enc_cop2_tx(sub, rt, rd) (enc_op(op_cop2) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd)) + +/* COP2 Data Load (lwc2): `lwc2 rt, off(rs)` + * Layout: [op_lwc2:6][rs:5][rt:5][imm:16] + * - rs: GPR base address + * - rt: COP2 data register index (0..31) + * - imm: signed 16-bit offset + * NOTE: When `rs` is a runtime register, the encoding cannot be pre-baked + * into a .word — use the string-style `asm_gte_load_v0` macro below instead. */ +#define enc_cop2_lwc2(rt, base, off) enc_i(op_lwc2, (base), (rt), (off)) +#define enc_cop2_swc2(rt, base, off) enc_i(op_swc2, (base), (rt), (off)) /* GTE Command Format (The math engine trigger) * Opcode is always MIPS_OP_COP2, RS is always 1 (CO). @@ -145,7 +158,7 @@ enum { #define enc_gte_cmd(cmd) (((cmd) & gte_mask_cmd) << gte_shift_cmd) /* Composite: all six GTE fields + the COP2/CO base. 
*/ -#define enc_gte_cmd(sf, mx, v, cv, lm, cmd) ( \ +#define enc_gte_cmdw(sf, mx, v, cv, lm, cmd) ( \ gte_cmd_base \ | enc_gte_sf(sf) \ | enc_gte_mx(mx) \ @@ -155,6 +168,26 @@ enum { | enc_gte_cmd(cmd) \ ) +/** + * @brief Loads a single SVECTOR to GTE vector register V0 + * + * @details Loads values from an SVECTOR struct to GTE data registers C2_VXY0 + * (XY at offset 0) and C2_VZ0 (Z at offset 4) using `lwc2`. + * + * Uses string-style GCC inline asm with `%0` substitution because the + * base register `r0` is a runtime GPR chosen by the compiler — it cannot + * be encoded into a static `.word` constant. + * + * Usage: + * asm_gte_load_v0(svector_ptr); + */ +#define asm_gte_load_v0(r0) asm volatile( \ + "lwc2 $0, 0(%0);" \ + "lwc2 $1, 4(%0);" \ + : \ + : "r"(r0) \ +) + /* asm_gte_matrix_set_rotation(r0) * * Loads the 3x3 rotation matrix at `r0` into the GTE's rotation-matrix @@ -178,6 +211,10 @@ enum { * ctc2 $13, $3 ; → C2_RT21 * ctc2 $14, $4 ; → C2_RT22 * + * NOTE(review): the body below does NOT use string-style `%0` asm yet: it + * feeds `r0` to `asm_inline` (`.word` + "i" operands, which need constants) + * and appends `: : "r"(r0)` after the clobber list. Convert before use. + * * WARNING: Incomplete by design. The source macro only writes RT11..RT22 * (5 of 9 rotation elements); RT23 and the entire RT3x row are left * untouched. Real libpsn00b SetRotMatrix writes all 9. Use only when the @@ -185,7 +222,7 @@ enum { * get stale-RT2x/RT3x artifacts in RTPS/RTPT/MVMVA output. 
*/ #define asm_gte_matrix_set_rotation(r0) \ - asm volatile( \ + asm volatile( \ asm_inline( \ load_imm(R_T4, r0, 0), \ load_imm(R_T5, r0, 4), \ @@ -199,6 +236,6 @@ enum { enc_cop2_tx(cop_mt, R_T6, 4) \ ) \ asm_clobber( clb_system, "$12", "$13", "$14") \ - : \ - : "r"(r0) \ + : \ + : "r"(r0) \ ) diff --git a/code/duffle/mips.h b/code/duffle/mips.h index 4653b9a..7b43f64 100644 --- a/code/duffle/mips.h +++ b/code/duffle/mips.h @@ -59,9 +59,11 @@ enum { , op_lw = 0x23 /* Load Word */ , op_lbu = 0x24 /* Load Byte Unsigned */ , op_lhu = 0x25 /* Load Halfword Unsigned */ + , op_lwc2 = 0x32 /* Load Word to Coprocessor 2 (GTE) */ , op_sb = 0x28 /* Store Byte */ , op_sh = 0x29 /* Store Halfword */ , op_sw = 0x2B /* Store Word */ + , op_swc2 = 0x3A /* Store Word from Coprocessor 2 (GTE) */ , op_load_addr = op_la , op_load_imm = op_li @@ -142,21 +144,55 @@ enum { _BitOffsets = 0 /* MIPS I-Type Instruction Format (Immediate/Constant) */ #define enc_i(op, rs, rt, imm) (enc_op(op) | enc_rs(rs) | enc_rt(rt) | enc_imm(imm)) -/* COP0 (System) Transfer Format */ -#define enc_cop0_tx(sub, rt, rd) enc_op(op_cop0) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd) +/* COP0 (System) Transfer Format: mtc0 rt, rd or mfc0 rt, rd + * `sub` is the COP0 sub-opcode (cop_mf=0 or cop_mt=4), placed in rs slot. + * `rt` is the GPR operand (in rt slot). + * `rd` is the COP0 register index (in rd slot at bits 15..11). */ +#define enc_cop0_tx(sub, rt, rd) enc_i(op_cop0, (sub), (rt), ((rd) << 11)) /* COP0 Return From Exception (rfe) */ #define enc_rfe() 0x42000010 -#define load_imm(rs,rt,imm) enc_i(op_lw, rs, rt, imm) -#define store_word(rs,rt,imm) enc_i(op_sw, rs, rt, imm) -#define add_ui(rs,rt,imm) enc_i(op_addiu, rs, rt, imm) -#define shift_ll(rs,rt,rd) enc_r(op_special, rs, rt, rd, 0, fc_sll) +/* --- Semantic Encoders (MIPS mnemonics) --- + * Argument order matches the MIPS assembly syntax: + * dest-first, then source operands, then immediate last. 
+ * + * load_word(rt, base, off) → lw rt, off(base) + * store_word(rt, base, off) → sw rt, off(base) + * add_ui(rt, rs, imm) → addiu rt, rs, imm + * shift_ll(rd, rt, shamt) → sll rd, rt, shamt + * jump_reg(rs) → jr rs + * jump_link(rs, rd) → jalr rd, rs (macro takes rs first; rd, e.g. $ra, must be passed) + * nop() → sll $0, $0, 0 + */ +#define load_word(rt, base, off) enc_i(op_lw, (base), (rt), (off)) +#define load_byte(rt, base, off) enc_i(op_lb, (base), (rt), (off)) +#define load_half(rt, base, off) enc_i(op_lh, (base), (rt), (off)) +#define load_byte_u(rt, base, off) enc_i(op_lbu, (base), (rt), (off)) +#define load_half_u(rt, base, off) enc_i(op_lhu, (base), (rt), (off)) +#define store_word(rt, base, off) enc_i(op_sw, (base), (rt), (off)) +#define add_ui(rt, rs, imm) enc_i(op_addiu, (rs), (rt), (imm)) +#define andi_op(rt, rs, imm) enc_i(op_andi, (rs), (rt), (imm)) +#define ori_op(rt, rs, imm) enc_i(op_ori, (rs), (rt), (imm)) +#define xori_op(rt, rs, imm) enc_i(op_xori, (rs), (rt), (imm)) +#define lui_op(rt, imm) enc_i(op_lui, R_0, (rt), (imm)) -#define jump_reg(rs) enc_r(op_special, rs, R_0, R_0, 0, fc_jr) -#define jump_nreg(rs,rt,rd) enc_r(op_special, rs, rt, rd, 0, fc_jalr) +/* Shift family (R-type). shift_ll/lr/ra: `sll/srl/sra rd, rt, shamt` (rs slot is always R_0) */ +#define shift_ll(rd, rt, shamt) enc_r(op_special, R_0, (rt), (rd), (shamt), fc_sll) +#define shift_lr(rd, rt, shamt) enc_r(op_special, R_0, (rt), (rd), (shamt), fc_srl) +#define shift_ra(rd, rt, shamt) enc_r(op_special, R_0, (rt), (rd), (shamt), fc_sra) -#define nop() shift_ll(rdiscard, rdiscard, rdiscard) +/* jr rs — jump to address in rs. */ +#define jump_reg(rs) enc_r(op_special, (rs), R_0, R_0, 0, fc_jr) + +/* jalr rd, rs — link in rd (pass $ra explicitly; this macro has no default) and jump to address in rs. + * Layout: [op_special][rs:5][rt=0:5][rd:5][shamt=0:5][fc_jalr=0x09] */ +#define jump_link(rs, rd) enc_r(op_special, (rs), R_0, (rd), 0, fc_jalr) + +/* Back-compat alias: the old `load_imm` was a misnomer for `lw`. NOTE: compatible by name only — operands are now (rt, base, off), whereas the old macro took (rs, rt, imm), i.e. base first. 
*/ +#define load_imm(rt, base, off) load_word(rt, base, off) + +#define nop() shift_ll(rdiscard, rdiscard, 0) // FI_ void emit_load_imm(U4 rs, U4 rt, U4 imm) { emit(load_imm()); } @@ -178,15 +214,28 @@ enum { bios_table_addr = 0xA0, }; -/* Flushes the Instruction Cache */ +/* Flushes the Instruction Cache (PSX A-function 0x44 via BIOS stub at 0xA0). + * + * Sequence (per MIPS ABI; arguments in arg registers, RA pushed to stack): + * 1. sp -= 8; sw $ra, 4($sp) ; save RA + * 2. $a0 = bios_flushcache (arg0) ; NOTE(review): PSX BIOS dispatch reads the function index from $t1 — verify which register `rret_0` is + * 3. $t0 = bios_table_addr ; t0 = &BIOS A-function table + * 4. jalr $ra, $t0 ; call BIOS(flushcache), return address linked in $ra + * nop ; branch delay slot + * 5. lw $ra, 4($sp); jr $ra ; restore & return + * 6. sp += 8 ; executes in the delay slot of the `jr` above + */ I_ Code CodeBlob_(mips_flush_icache) { - add_ui(rstack_ptr, rstack_ptr, -8), - store_word(rstack_ptr, rret_addr, 4), - add_ui(rdiscard, rret_0, bios_flushcache), add_ui(rdiscard, rtmp_0, bios_table_addr), - jump_nreg(rtmp_0, rdiscard, rret_addr), - nop(), load_imm(rstack_ptr, rret_addr, 4), jump_reg(rret_addr), - add_ui(rstack_ptr, rstack_ptr, 8) + add_ui(rstack_ptr, rstack_ptr, -8), /* sp -= 8 */ + store_word(rret_addr, rstack_ptr, 4), /* sw $ra, 4($sp) */ + add_ui(rret_0, rdiscard, bios_flushcache), /* addiu $a0, $0, 0x44 */ + add_ui(rtmp_0, rdiscard, bios_table_addr), /* addiu $t0, $0, 0xA0 */ + jump_link(rtmp_0, rret_addr), /* jalr $ra, $t0 */ + nop(), /* BD slot */ + load_word(rret_addr, rstack_ptr, 4), /* lw $ra, 4($sp) */ + jump_reg(rret_addr), /* jr $ra */ + add_ui(rstack_ptr, rstack_ptr, 8) /* sp += 8 (BD) */ }; FI_ void mips_flush_icache(void) { C_(VoidFn*, codeblob_mips_flush_icache)(); } @@ -194,12 +243,15 @@ FI_ void mips_flush_icache(void) { C_(VoidFn*, codeblob_mips_flush_icache)(); } #define asm_mips_flush_icache() asm volatile( \ asm_inline( \ - add_ui(rstack_ptr, rstack_ptr, -8) \ - , store_word(rstack_ptr, rret_addr, 4) \ - , add_ui(rdiscard, rret_0, bios_flushcache), add_ui(rdiscard, rtmp_0, bios_table_addr) \ - , jump_nreg(rtmp_0, rdiscard, 
rret_addr) \ - , nop(), load_imm(rstack_ptr, rret_addr, 4), jump_reg(rret_addr) \ - , add_ui(rstack_ptr, rstack_ptr, 8) \ + add_ui(rstack_ptr, rstack_ptr, -8) \ + , store_word(rret_addr, rstack_ptr, 4) \ + , add_ui(rret_0, rdiscard, bios_flushcache) \ + , add_ui(rtmp_0, rdiscard, bios_table_addr) \ + , jump_link(rtmp_0, rret_addr) \ + , nop() \ + , load_word(rret_addr, rstack_ptr, 4) \ + , jump_reg(rret_addr) \ + , add_ui(rstack_ptr, rstack_ptr, 8) \ ) \ asm_clobber( clb_system ) \ ) diff --git a/code/gte_hello/hello_gte.c b/code/gte_hello/hello_gte.c index 470d2ef..8d1d6c4 100644 --- a/code/gte_hello/hello_gte.c +++ b/code/gte_hello/hello_gte.c @@ -9,6 +9,7 @@ #include "duffle/memory.h" #include "duffle/math.h" #include "duffle/gp.h" +#include "duffle/gte.h" #include "hello_gte.h" enum { @@ -161,26 +162,6 @@ void gp_display_frame(DoubleBuffer* screen_buf, S2* active_buf_id, U4* ordering_ void render(void) { } -// #define gte_ldv0(r0) \ -// __asm__ volatile( \ -// "lwc2 $0, 0( %0 );" \ -// "lwc2 $1, 4( %0 )" \ -// : \ -// : "r"(r0)) - -/** - * @brief Loads a single V3_S2 to GTE vector register V0 - * - * @details Loads values from an V3_S2 struct to GTE data registers C2_VXY0 - * and C2_VZ0. - */ -// #define gte_ldv0( r0 ) __asm__ volatile ( \ -// "lwc2 $0, 0( %0 );" \ -// "lwc2 $1, 4( %0 );" \ -// : \ -// : "r"( r0 ) \ -// : "$t0" ) - void update(PrimitiveArena* pa, U4* ordering_buf) { orderingtbl_clear_reverse(ordering_buf, OrderingTbl_Len); @@ -262,6 +243,8 @@ void update(PrimitiveArena* pa, U4* ordering_buf) V3_S2* p1 = & static_mem.floor.verts[face->y]; V3_S2* p2 = & static_mem.floor.verts[face->z]; + asm_gte_load_v0(p0); + nclip = rtp_avg_nclip_a3_v3s2(p0, p1, p2 , & tri->p0, & tri->p1, & tri->p2 , & p, & orderingtbl_z, & flag