WIP: trying to see if I can get this tape execution working

This commit is contained in:
2026-06-14 18:38:38 -04:00
parent 2c3d0c4af7
commit 1b77d8bae3
4 changed files with 205 additions and 9 deletions
+8 -2
View File
@@ -271,8 +271,14 @@ enum { _C2_OPS_ = 0
* - rd: COP2 control register index (0..31) */
#define enc_gte_tx(sub, rt, rd) (enc_op(op_cop2) | enc_rs(sub) | enc_rt(rt) | enc_rd(rd))
/* Explicit GTE Data vs Control Register Transfers.
 * Each macro encodes one COP2 register-move instruction via enc_gte_tx; the first
 * argument is the transfer sub-opcode placed in the rs field:
 *   0x00 = mfc2 (data reg -> GPR)     0x02 = cfc2 (control reg -> GPR)
 *   0x04 = mtc2 (GPR -> data reg)     0x06 = ctc2 (GPR -> control reg)
 * - rt: CPU general-purpose register
 * - rd: GTE register index within the selected (data or control) bank
 * Only one definition of each name is kept so that data-bank and control-bank
 * moves can never be confused at a call site. */
#define gte_mf(rt, rd) enc_gte_tx(0x00, (rt), (rd)) /* Move from GTE Data Reg (e.g. MAC0, OTZ) */
#define gte_cf(rt, rd) enc_gte_tx(0x02, (rt), (rd)) /* Move from GTE Control Reg */
#define gte_mt(rt, rd) enc_gte_tx(0x04, (rt), (rd)) /* Move to GTE Data Reg (e.g. VXY0) */
#define gte_ct(rt, rd) enc_gte_tx(0x06, (rt), (rd)) /* Move to GTE Control Reg (e.g. Matrices) */
/* COP2 Data Load (lwc2): `lwc2 rt, off(rs)`
* Layout: [op_lwc2:6][rs:5][rt:5][imm:16]
+148
View File
@@ -0,0 +1,148 @@
#ifdef INTELLISENSE_DIRECTIVES
# pragma once
# include "dsl.h"
# include "gcc_asm.h"
# include "mips.h"
# include "gte.h"
# include "memory.h"
#endif
/* R_T8 is our dedicated Tape Pointer (TP).
 * R_TP always addresses the next unread 32-bit word of the tape; atoms consume
 * words from it (operands or the next atom address) and advance it by 4. */
#define R_TP R_T8
#define R_TP_Code R_T8_Code
/* The 'Yield' sequence for CodeBlobs.
 * Expands to four comma-separated instruction words meant to terminate a blob:
 *   1. load the next tape word (address of the next atom) into R_T9
 *   2. advance R_TP past that word (this also separates the load from its use,
 *      so R_T9 is not consumed by the instruction directly after the load)
 *   3. jump to the address in R_T9
 *   4. nop in the jump's delay slot
 * R_T9 is overwritten by every yield. */
#define mips_yield \
load_word(R_T9, R_TP, 0) \
, add_ui(R_TP, R_TP, 4) \
, jump_reg(R_T9) \
, nop
/* The 'Exit' Atom.
 * Terminates tape execution: jumps to the address held in rret_addr with a nop in
 * the delay slot. It does not read the tape, so it must be the last word executed
 * (tb_end appends it automatically).
 * NOTE(review): rret_addr is presumably $ra, i.e. the link set by the `jalr` in
 * tape_run — confirm against the register definitions. */
internal Code CodeBlob_(tape_exit) { jump_reg(rret_addr), nop };
/* Slice type for 32-bit tape words (used below as Slice_U4). */
typedef Slice_(U4);
/* Execute a tape of atoms.
 *
 * tape          : tape words; tape.ptr must address the first atom. Only the pointer is
 *                 used here — execution ends when an atom jumps back through the link
 *                 register (see tape_exit), not when the slice length is exhausted.
 * r_prim_cursor : in/out; *r_prim_cursor is handed to the atoms in R_T7 and the final
 *                 register value is written back on return.
 * face_cursor   : handed to the atoms in R_T4.
 * vert_base     : handed to the atoms in R_T5.
 * ot_base       : handed to the atoms in R_T6.
 *
 * NOTE(review): rgcc(...) is presumably GCC's explicit-register local variable syntax;
 * the bindings only work if it pins each variable to the named register. */
FI_ void tape_run(Slice_U4 tape, B1** r_prim_cursor, void* face_cursor, void* vert_base, void* ot_base) {
/* Pin the workspace into the registers the atoms are hand-encoded against. */
register U4* tp rgcc(R_TP) = tape.ptr;
register B1* pcur rgcc(R_T7) = *r_prim_cursor;
register void* r_t4 rgcc(R_T4) = face_cursor;
register void* r_t5 rgcc(R_T5) = vert_base;
register void* r_t6 rgcc(R_T6) = ot_base;
/* Bootstrap: fetch the first atom address into $25 (t9), step the tape pointer past
 * it, then jump-and-link so the exit atom has a return address to jump to. */
asm volatile(
"lw $25, 0(%0);"
"addiu %0, %0, 4;"
"jalr $25;"
"nop;"
/* All five workspace registers are read-write: atoms may advance any of them. */
: "+r"(tp), "+r"(pcur), "+r"(r_t4), "+r"(r_t5), "+r"(r_t6)
:
/* Scratch registers the atoms are allowed to trash, plus the link register and memory. */
: "at", "v0", "v1", "t0", "t1", "t2", "t3", "t9", "ra", "memory"
);
/* Publish how far the atoms advanced the primitive cursor. */
*r_prim_cursor = pcur;
}
/* TapeBuilder: thin handle over the arena that receives the tape words. */
typedef Struct_(TapeBuilder) {FArena* arena;};
/* Start building a tape that will be appended to `arena`. */
FI_ TapeBuilder tb_begin(FArena* arena) {
	TapeBuilder builder = { arena };
	return builder;
}
/* Append one 32-bit word to the tape: the address of `atom` (or any operand the
 * caller has cast to Code*), stored as a U4. */
I_ void tb_emit(TapeBuilder* tb, Code* atom) {
	U4* cell = farena_push_type(tb->arena, U4);
	*cell = (U4)atom;
}
/* Finish the tape: append the exit atom, then return a slice that spans the arena
 * from its start, with the length expressed in 4-byte words. */
I_ Slice_U4 tb_end(TapeBuilder* tb) {
	tb_emit(tb, code_tape_exit);
	FArena* arena = tb->arena;
	Slice_U4 tape = { (U4*)arena->start, arena->used / 4 };
	return tape;
}
/* atom_set_gte_world — tape atom with one inline operand.
 * Tape layout: [address of this atom][address of transform data]
 * Reads eight consecutive 32-bit words from the transform data (offsets 0..28) and
 * writes them to eight GTE control registers, then yields to the next atom.
 * Scratch: R_T0..R_T3 (and R_T9 via mips_yield).
 * Loads are interleaved so that no register is consumed by the instruction directly
 * following its load.
 * NOTE(review): this assumes the gte_cr_RT11/RT12/RT13/RT21/RT22 names map to the five
 * consecutive rotation-matrix control registers and that the source struct packs the
 * matrix in the first 20 bytes followed by three translation words — confirm against
 * the gte_cr_* enum and the transform type. */
internal Code CodeBlob_(atom_set_gte_world) {
/* Pop matrix address from tape into R_T3 ($11) */
load_word(R_T3, R_TP, 0),
add_ui(R_TP, R_TP, 4),
/* Load 3x3 Rotation + 3x1 Translation from R_T3 into GTE CONTROL Regs (ctc2) */
/* Words 0-1 */
load_word(R_T0, R_T3, 0), load_word(R_T1, R_T3, 4),
gte_ct(R_T0, gte_cr_RT11), gte_ct(R_T1, gte_cr_RT12),
/* Words 2-4 */
load_word(R_T0, R_T3, 8), load_word(R_T1, R_T3, 12), load_word(R_T2, R_T3, 16),
gte_ct(R_T0, gte_cr_RT13), gte_ct(R_T1, gte_cr_RT21), gte_ct(R_T2, gte_cr_RT22),
/* Words 5-7: translation */
load_word(R_T0, R_T3, 20), load_word(R_T1, R_T3, 24), load_word(R_T2, R_T3, 28),
gte_ct(R_T0, gte_cr_TRX), gte_ct(R_T1, gte_cr_TRY), gte_ct(R_T2, gte_cr_TRZ),
mips_yield
};
/* atom_floor_tri — tape atom (no inline operands).
 * Transforms one indexed triangle on the GTE and, when it is front-facing and its
 * depth slot is in range, writes a 5-word flat-triangle packet at the primitive
 * cursor and links it into the ordering table.
 *
 * Register workspace (bound by tape_run):
 *   R_T4 = face cursor   : three 16-bit vertex indices at +0/+2/+4, advanced by 8 per face
 *   R_T5 = vertex base   : 8 bytes per vertex (XY word, Z word)
 *   R_T6 = ordering table base, one 32-bit entry per depth slot
 *   R_T7 = primitive cursor, advanced by 20 bytes only when a packet is linked
 * Scratch: R_AT, R_V0, R_V1, R_T0..R_T2 (and R_T9 via mips_yield).
 *
 * This is raw machine code, so hazards are ours to handle:
 *   - a register written by a load or by a move-from-GTE must not be read by the
 *     very next instruction;
 *   - branch offsets are counted in instructions from the branch's delay slot
 *     (delay slot = index 0). The numbered comments below track those indices —
 *     re-count both branches whenever an instruction is added or removed.
 *
 * Linked-list format: the top 8 bits of a node's first word give the number of
 * words that follow in THAT node, the low 24 bits address the next node. The
 * packet therefore carries length 4 in its own tag, and the table entry is
 * rewritten with the packet address and length 0.
 * NOTE(review): assumes the ordering table has 2048 entries and that its entries
 * are zero-length nodes — confirm against the table setup code. */
internal Code CodeBlob_(atom_floor_tri) {
	/* 1. Load 3 indices from $t4 */
	load_half_u(R_T0, R_T4, 0),
	load_half_u(R_T1, R_T4, 2),
	load_half_u(R_T2, R_T4, 4),
	/* 2. Load Vertices: Addr = Base + (idx * 8). Write to GTE DATA Regs (mtc2) */
	shift_ll(R_AT, R_T0, 3), add_u(R_AT, R_AT, R_T5),
	load_word(R_V0, R_AT, 0), load_word(R_V1, R_AT, 4),
	gte_mt(R_V0, C2_VXY0), gte_mt(R_V1, C2_VZ0),
	shift_ll(R_AT, R_T1, 3), add_u(R_AT, R_AT, R_T5),
	load_word(R_V0, R_AT, 0), load_word(R_V1, R_AT, 4),
	gte_mt(R_V0, C2_VXY1), gte_mt(R_V1, C2_VZ1),
	shift_ll(R_AT, R_T2, 3), add_u(R_AT, R_AT, R_T5),
	load_word(R_V0, R_AT, 0), load_word(R_V1, R_AT, 4),
	gte_mt(R_V0, C2_VXY2), gte_mt(R_V1, C2_VZ2),
	/* 3. RTPT + NCLIP */
	nop, nop, gte_cmdw_rtpt,
	nop, nop, gte_cmdw_nclip,
	nop, nop, /* Wait for NCLIP to finish */
	/* 4. Check NCLIP.
	   If MAC0 <= 0 (Backface), branch to the face-cursor advance.
	   Target is 30 instructions past the delay slot. */
	gte_mf(R_T0, C2_MAC0),
	nop,                                     /* R_T0 is not valid until one instruction after the move */
	branch_le_zero(R_T0, 30),
	nop,                                     /* <--- DELAY SLOT (Index 0) */
	/* 5. Store Primitive Data */
	/*  1 */ store_word(R_0, R_T7, 0),       /* Clear tag; rewritten at 25 if the packet is linked */
	/*  2 */ load_ui(R_AT, 0x20FF),          /* High: Code 0x20 + Color R:FF */
	/*  3 */ or_i(R_AT, R_AT, 0xFFFF),       /* Low: Color G:FF, B:FF */
	/*  4 */ store_word(R_AT, R_T7, 4),
	/*  5 */ enc_gte_sw(C2_SXY0, R_T7, 8),
	/*  6 */ enc_gte_sw(C2_SXY1, R_T7, 12),
	/*  7 */ enc_gte_sw(C2_SXY2, R_T7, 16),
	/* 6. OT Insertion with Bounds Checking */
	/*  8 */ nop,
	/*  9 */ nop,
	/* 10 */ enc_gte_cmd(0x2D),              /* AVSZ3 */
	/* 11 */ nop,                            /* Wait for AVSZ3 */
	/* 12 */ nop,                            /* Wait for AVSZ3 */
	/* 13 */ gte_mf(R_T1, C2_OTZ),           /* T1 = Depth index */
	/* Bounds Check: OTZ < 2048.
	   The limit must be built with addiu from $zero: lui would place 2048 in the
	   upper halfword (0x08000000) and the check could never fail. */
	/* 14 */ add_ui(R_AT, R_0, 2048),        /* AT = 2048 */
	/* 15 */ slt_u(R_AT, R_T1, R_AT),        /* AT = (OTZ < 2048) ? 1 : 0 */
	/* 16 */ branch_equal(R_AT, R_0, 13),    /* If AT == 0, skip to 30 (13 instrs past delay) */
	/* 17 */ nop,                            /* <--- DELAY SLOT (Index 0 for Bounds branch) */
	/* 18  (1) */ shift_ll(R_T1, R_T1, 2),
	/* 19  (2) */ add_u(R_T1, R_T1, R_T6),   /* T1 = &OrderingTable[OTZ] */
	/* 20  (3) */ load_word(R_AT, R_T1, 0),  /* AT = current head */
	/* 21  (4) */ load_ui(R_V0, 0x0400),     /* V0 = 0x04000000 (Len 4); also separates the load from its use */
	/* 22  (5) */ shift_ll(R_AT, R_AT, 8),
	/* 23  (6) */ shift_lr(R_AT, R_AT, 8),   /* AT = head & 0x00FFFFFF */
	/* 24  (7) */ or_u(R_AT, R_AT, V0_TAG_SRC), /* AT = packet tag: Len 4 | next = old head */
	/* 25  (8) */ store_word(R_AT, R_T7, 0), /* prim->tag = Tag */
	/* 26  (9) */ shift_ll(R_V0, R_T7, 8),
	/* 27 (10) */ shift_lr(R_V0, R_V0, 8),   /* V0 = T7 & 0x00FFFFFF (Len 0) */
	/* 28 (11) */ store_word(R_V0, R_T1, 0), /* OrderingTable[OTZ] = &prim */
	/* 29 (12) */ add_ui(R_T7, R_T7, 20),    /* Advance Prim Cursor (5 words) */
	/* 7. Yield */
	/* 30 (13) */ add_ui(R_T4, R_T4, 8),     /* Advance Face Cursor (4 * S2 = 8 bytes) */
	mips_yield
};
+13 -7
View File
@@ -246,6 +246,13 @@ enum { _BitOffsets = 0
#define xor_i(rt, rs, imm) enc_i(op_xori, (rs), (rt), (imm))
#define load_ui(rt, imm) enc_i(op_lui, R_0, (rt), (imm))
/* Logic Opcodes (R-type, register-register).
 * Macro arguments are in assembly order (rd, rs, rt); enc_r takes them in
 * encoding order (rs, rt, rd), with the shift amount fixed at 0. */
#define and_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_and)
#define or_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_or)
#define xor_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_xor)
#define nor_u(rd, rs, rt) enc_r(op_special, (rs), (rt), (rd), 0, fc_nor)
/* Shift family (R-type). shift_ll/lr/ra: `sll rd, rt, shamt`
 * The source operand is rt; the rs field is unused and encoded as R_0.
 * shamt is the constant shift amount. */
#define shift_ll(rd, rt, shamt) enc_r(op_special, R_0, (rt), (rd), (shamt), fc_sll)
#define shift_lr(rd, rt, shamt) enc_r(op_special, R_0, (rt), (rd), (shamt), fc_srl)
@@ -321,12 +328,13 @@ enum { _BitOffsets = 0
* branch_le_zero rs, off → blez rs, off
* branch_ge_zero rs, off → bgez rs, off
* (For `bgez`, the opcode is `op_bcond` with rt=1 to invert the bltz condition.) */
/* Branch encoders. Argument order follows the assembly form; enc_i takes
 * (opcode, rs, rt, imm). `off` is the branch displacement in instructions,
 * counted from the branch's delay slot.
 * The register under test always goes in the rs field. For the compare-with-zero
 * forms the rt field is not an operand: it is 0 for blez/bgtz, and for the
 * op_bcond group it selects the condition (0 = bltz, 1 = bgez).
 * Exactly one definition of each name is kept, so the encoding cannot depend on
 * definition order. */
#define branch_equal(rs, rt, off) enc_i(op_beq, (rs), (rt), (off))
#define branch_ne(rs, rt, off) enc_i(op_bne, (rs), (rt), (off))
#define branch_lt_zero(rs, off) enc_i(op_bcond, (rs), R_0, (off)) /* bltz is bcond with rt=0 */
#define branch_ge_zero(rs, off) enc_i(op_bcond, (rs), 1, (off)) /* bgez is bcond with rt=1 */
#define branch_le_zero(rs, off) enc_i(op_blez, (rs), R_0, (off)) /* blez has its own opcode, rt=0 */
#define branch_gt_zero(rs, off) enc_i(op_bgtz, (rs), R_0, (off)) /* bgtz has its own opcode, rt=0 */
/* --- System (kernel) instructions --- */
/* syscall(): R-type word under op_special with function fc_syscall; every register
 * field and the shift amount are encoded as zero. Takes no operands. */
#define syscall() enc_r(op_special, R_0, R_0, R_0, 0, fc_syscall)
@@ -490,7 +498,7 @@ Code CodeBlob_(mips_flush_icache) {
, jump_reg(rret_addr) /* jr $ra */
, add_ui(rstack_ptr, rstack_ptr, 8) /* sp += 8 (BD) */
};
/* C-callable entry point for the code_mips_flush_icache blob: the blob is invoked
 * as a function taking and returning nothing. Defined exactly once; a second
 * definition under a different inline qualifier would be a redefinition.
 * NOTE(review): C_(T, x) is presumably the project's cast macro — confirm. */
I_ void mips_flush_icache(void) { C_(VoidFn*, code_mips_flush_icache)(); }
/* Standard clobber list for pure-MIPS asm volatile blocks: caller-saved
* GPRs that the kernel treats as volatile (v0/v1/t0/t1/ra) plus the
@@ -513,5 +521,3 @@ FI_ void mips_flush_icache(void) { C_(VoidFn*, code_mips_flush_icache)(); }
/* Smoke test: calls asm_mips_flush_icache() once.
 * Declared with (void) so the definition is a real prototype; an empty parameter
 * list leaves the arguments unspecified before C23. */
void test_mips_asm(void) {
	asm_mips_flush_icache();
}
// TAPE & EMITTERS
+36
View File
@@ -12,6 +12,7 @@
#include "duffle/mips.h"
#include "duffle/gp.h"
#include "duffle/gte.h"
#include "duffle/lottes_tape.h"
#include "hello_gte.h"
enum {
@@ -229,6 +230,7 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
static_mem.cube.rot.y += 30;
}
// Draw Floor
if (0)
{
m3s2_rotation (& static_mem.floor.rot, & static_mem.tform_world);
m3s2_translation(& static_mem.tform_world, & static_mem.floor.pos);
@@ -282,6 +284,40 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
}
static_mem.floor.rot.y += 5;
}
// Draw floor tape method
if (1)
{
LP_ U4 mem_temp_tape[512]; // Buffer for function addresses
FArena tape_arena;
farena_init(&tape_arena, slice_ut(mem_temp_tape, S_(mem_temp_tape)));
TapeBuilder tb = tb_begin(&tape_arena); {
// Setup state atoms
m3s2_rotation(&static_mem.floor.rot, &static_mem.tform_world);
m3s2_translation(&static_mem.tform_world, &static_mem.floor.pos);
// Push "Protocol" to tape
tb_emit(&tb, code_atom_set_gte_world);
tb_emit(&tb, (Code*)&static_mem.tform_world);
for (U4 i = 0; i < Floor_num_faces; i++) {
tb_emit(&tb, code_atom_floor_tri);
}
}
Slice_U4 tape = tb_end(&tb);
// 1. Setup Argument Registers (The Workspace)
register V3_S2* face_cursor rgcc(R_T4) = static_mem.floor.faces;
register V3_S2* vert_base rgcc(R_T5) = static_mem.floor.verts;
register U4* ot_base rgcc(R_T6) = ordering_buf;
// --- EXECUTION ---
B1* prim_cursor = (B1*)r_(pa->buf)[static_mem.active_buf_id] + pa->used;
// 2. Fire the Tape Drive (Explicitly bind the workspace variables)
tape_run(tape, &prim_cursor, static_mem.floor.faces, static_mem.floor.verts, ordering_buf);
// 3. Update C-side state
pa->used = (U4)prim_cursor - (U4)r_(pa->buf)[static_mem.active_buf_id];
static_mem.floor.rot.y += 5;
}
}
int main(void)