Finish chapter: The GTE Coprocessor.

2026-06-01 18:41:13 -07:00 · 2026-03-28 12:51:29 -04:00
parent b846c697c6
commit d1919137a3
8 changed files with 135 additions and 51 deletions
@@ -1,8 +1,8 @@
 build
 toolchain/armips
 toolchain/pcsx-redux
-toolchain/psyq_iwyu
-toolchain/PSn00bSDK
+# toolchain/psyq_iwyu
+# toolchain/PSn00bSDK

 *.exe
 *.elf
@@ -14,7 +14,7 @@
                "INTELLISENSE_DIRECTIVES"
            ],
            "intelliSenseMode": "gcc-x86",
-            "compilerPath": "C:\\Users\\Ed\\AppData\\Roaming\\mips\\versions\\v14.2.0\\bin\\mipsel-none-elf-gcc.exe",
+            "compilerPath": "C:/Users/Ed/scoop/apps/gcc/current/bin/gcc.exe"
        }
    ],
    "version": 4
@@ -3,10 +3,15 @@
 #	include "assert.h"
 #endif

+#define LP_      static // local_persist: static storage; by its name, meant for function-local variables that persist across calls
+#define internal static // internal linkage: the symbol is private to its translation unit
+#define global          // expands to nothing; annotation only (used as `global SMemory static_mem;`)
+#define gknown          // expands to nothing; annotation only. NOTE(review): intended meaning is not shown here, confirm
+
 #define align_(value) __attribute__((aligned (value)))             // for easy alignment
 #define expect_(x, y) __builtin_expect(x, y)                       // so compiler knows the common path
-#define finline           static inline __attribute__((always_inline)) // force inline
-#define no_inline         static        __attribute__((noinline))      // force no inline [used in thread api]
+#define FI_           static inline __attribute__((always_inline)) // force inline
+#define NI_           static        __attribute__((noinline))      // force no inline [used in thread api]
 #define R_            __restrict                                   // pointers are either restricted or volatile and nothing else 
 #define V_            volatile                                     // pointers are either restricted or volatile and nothing else

@@ -16,12 +21,7 @@
 #define stringify(S)             stringify_impl(S)
 #define tmpl(prefix, type)       prefix ## _ ## type

-#define local_persist            static
-#define internal                 static
-#define global
-#define gknown
-
-#define offset_of(type, member)  cast(SSIZE, & (((type*) 0)->member))
+#define offset_of(type, member)  cast(U8,__builtin_offsetof(type,member)) // byte offset of member within type, via the compiler builtin. NOTE(review): no U8 typedef is visible next to the U1/U2/U4 definitions in this change, confirm U8 exists
 #define static_assert            _Static_assert
 #define typeof                   __typeof__
 #define typeof_ptr(ptr)          typeof((ptr)[0])
@@ -32,9 +32,16 @@
 #define def_ptr_set(type)        def_R_(type); typedef def_V_(type)
 #define def_tset(type)           type; typedef def_ptr_set(type)

-typedef __UINT8_TYPE__  def_tset(U1); typedef __UINT16_TYPE__ def_tset(U2); typedef __UINT32_TYPE__ def_tset(U4);
-typedef __INT8_TYPE__   def_tset(S1); typedef __INT16_TYPE__  def_tset(S2); typedef __INT32_TYPE__  def_tset(S4);
-typedef unsigned char   def_tset(B1); typedef __UINT16_TYPE__ def_tset(B2); typedef __UINT32_TYPE__ def_tset(B4);
+typedef __UINT8_TYPE__  def_tset(U1); // unsigned, 1 byte; def_tset also emits the pointer typedef set (see def_ptr_set)
+typedef __UINT16_TYPE__ def_tset(U2); // unsigned, 2 bytes
+typedef __UINT32_TYPE__ def_tset(U4); // unsigned, 4 bytes
+typedef __INT8_TYPE__   def_tset(S1); // signed, 1 byte
+typedef __INT16_TYPE__  def_tset(S2); // signed, 2 bytes
+typedef __INT32_TYPE__  def_tset(S4); // signed, 4 bytes
+typedef unsigned char   def_tset(B1); // B-family, 1 byte (declared from unsigned char rather than __UINT8_TYPE__)
+typedef __UINT16_TYPE__ def_tset(B2); // B-family, 2 bytes
+typedef __UINT32_TYPE__ def_tset(B4); // B-family, 4 bytes
+typedef __UINT64_TYPE__ def_tset(B8); // B-family, 8 bytes; the only 64-bit type declared in this list
 enum { false = 0, true  = 1, true_overflow, };

 #define u1_r(value) cast(U1_R, value)
@@ -75,6 +82,8 @@ enum { false = 0, true  = 1, true_overflow, };

 #define r_(ptr)                             cast(typeof_ptr(ptr)*R_, ptr)
 #define v_(ptr)                             cast(typeof_ptr(ptr)*V_, ptr)
+#define tr_(type, ptr)                      cast(type*R_, ptr) // like r_(), but the pointee type is named explicitly instead of derived with typeof_ptr; result is `type*` with the R_ (__restrict) qualifier
+#define tv_(type, ptr)                      cast(type*V_, ptr) // like v_(), but the pointee type is named explicitly instead of derived with typeof_ptr; result is `type*` with the V_ (volatile) qualifier

 #define kilo(n)                             (cast(U4, n) << 10)
 #define mega(n)                             (cast(U4, n) << 20)
@@ -87,7 +96,7 @@ enum { false = 0, true  = 1, true_overflow, };
 #define sop_2(op, a, b) cast(U2, s2_(a) op s2_(b))
 #define sop_4(op, a, b) cast(U4, s4_(a) op s4_(b))

-#define def_signed_op(id, op, width) finline U ## width id ## _s ## width(U ## width a, U ## width b) {return sop_ ## width(op, a, b); }
+#define def_signed_op(id, op, width) FI_ U ## width id ## _s ## width(U ## width a, U ## width b) {return sop_ ## width(op, a, b); }
 #define def_signed_ops(id, op)       def_signed_op(id, op, 1) def_signed_op(id, op, 2) def_signed_op(id, op, 4)
 def_signed_ops(add, +) def_signed_ops(sub, -)
 def_signed_ops(mut, *) def_signed_ops(div, /)
@@ -103,13 +112,13 @@ def_signed_ops(ge, >=) def_signed_ops(le, <=)
 #define ge_s(a,b)  def_generic_sop(ge, a,b)
 #define le_s(a,b)  def_generic_sop(le, a,b)

-#define span_iter(type, iter, m_begin, op, m_end)  \
+#define span_iter(type, iter, m_begin, op, m_end) ( \
 	tmpl(Iter_Span,type) iter = {     \
 		.r      = {(m_begin), (m_end)}, \
 		.cursor = (m_begin) };          \
 	iter.cursor op iter.r.end;        \
-	++ iter.cursor
-
+	++ iter.cursor                    \
+)
 #define def_span(type)                                                \
 	        def_struct(tmpl(     Span,type)) { type begin; type end; }; \
 	typedef def_struct(tmpl(Iter_Span,type)) { tmpl(Span,type) r; type cursor; }
@@ -127,7 +136,7 @@ typedef def_struct(Slice_Str8) { Str8* ptr; U4 len; };
 #define def_Slice(type)           def_struct(tmpl(Slice,type)) { type* ptr; U4 len; }
 #define slice_assert(slice)       do { assert((slice).ptr != nullptr); assert((slice).len > 0); } while(0)
 #define slice_end(slice)          ((slice).ptr + (slice).len)
-#define size_of_slice_type(slice) size_of( * (slice).ptr )
+#define size_of_slice_type(slice) size_of((slice).ptr[0])

 typedef def_Slice(void);
 typedef def_Slice(B1);
@@ -142,10 +151,11 @@ void slice__zero(Slice_B1 mem, U4 typewidth);
 } while (0)
 #define slice_zero(slice) slice__zero(slice_byte(slice), size_of_slice_type(slice))

-#define slice_iter(container, iter)               \
+#define slice_iter(container, iter) (             \
 	typeof((container).ptr) iter = (container).ptr; \
 	iter != slice_end(container);                   \
-	++ iter
+	++ iter                                         \
+)
 #define slice_from_farray(type, ...) & (tmpl(Slice,type)) { \
 	.ptr = farray_init(type, __VA_ARGS__),             \
 	.len = farray_len( farray_init(type, __VA_ARGS__)) \
@@ -16,10 +16,14 @@ typedef S2 A3A3_S2[3][3];

 typedef def_struct(Extent2_S2) { S2 width; S2 height; };
 typedef def_struct(Extent2_S4) { S4 width; S4 height; };
+
 typedef def_struct(V2_S2)      { S2 x; S2 y; };
 typedef def_struct(V2_S4)      { S4 x; S4 y; };
 typedef def_struct(V3_S2)      { S2 x; S2 y; S2 z; S2 pad; };
 typedef def_struct(V3_S4)      { S4 x; S4 y; S4 z; S4 pad; };
+typedef def_struct(V4_S2)      { S2 x; S2 y; S2 z; S2 w; }; // four S2 components; same member count as V3_S2, with w where V3_S2 has pad
+typedef def_struct(V4_S4)      { S4 x; S4 y; S4 z; S4 w; }; // four S4 components; same member count as V3_S4, with w where V3_S4 has pad
+
 typedef def_struct(R2_S2)      { V2_S2 p0; V2_S2 p1; };
 typedef def_struct(R2_S4)      { V2_S4 p0; V2_S4 p1; };

@@ -31,3 +35,5 @@ typedef def_struct(M3_S2) { A3A3_S2 m; A3_S4 t; };
 #define v2s2(x,y)     (V2_S2){x,y}
 #define v3s2(x,y,z)   (V3_S2){x,y,z}
 #define v3s4(x,y,z)   (V3_S4){x,y,z}
+#define v4s2(x,y,z,w) (V4_S2){x,y,z,w} // compound-literal constructor for V4_S2; arguments are positional (x, y, z, w)
+#define v4s4(x,y,z,w) (V4_S4){x,y,z,w} // compound-literal constructor for V4_S4; arguments are positional (x, y, z, w)
@@ -94,8 +94,7 @@ struct AllocatorProc_Out {
 	U4                  left; // Contiguous memory left
 	U4                  max_alloc;
 	U4                  min_alloc;
-	B4                  continuity_break; // Whether this allocation broke continuity with the previous (address space wise)
-	byte_pad(4);
+	// byte_pad(8);
 };
 typedef def_struct(AllocatorInfo) {
 	AllocatorProc* proc;
@@ -108,8 +107,7 @@ typedef def_struct(AllocatorQueryInfo) {
 	U4                  left; // Contiguous memory left
 	U4                  max_alloc;
 	U4                  min_alloc;
-	B4                  continuity_break; // Whether this allocation broke continuity with the previous (address space wise)
-	byte_pad(4);
+	// byte_pad(4);
 };
 static_assert(size_of(AllocatorProc_Out) == size_of(AllocatorQueryInfo));

@@ -11,6 +11,8 @@
 #include "duffle/gp.h"
 #include "hello_gpu.h"

+#define GTE_Coprocessor_Chapter 1 // enables the section of update() guarded by `#if GTE_Coprocessor_Chapter`
+
 typedef def_farray(V2_S2, 3);
 typedef def_struct(Poly_F3) {
 	U4   tag;
@@ -73,12 +75,22 @@ typedef def_struct(PrimitiveArena) {
 	U4                 used;
 };

-#define Cube_num_verts 8
-#define Cube_num_faces 12
-typedef def_farray(V3_S2, Cube_num_verts);
-typedef def_farray(V3_S2, Cube_num_faces);
+#define GTE_Coprocessor_UseQuads 1 // cube face table is 6 entries of V4_S2 vertex indices, drawn as Poly_G4
+#define GTE_Coprocessor_UseTris  0 // cube face table is 12 entries of V3_S2 vertex indices, drawn as Poly_G3

-void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
+#define Cube_num_verts 8
+typedef def_farray(V3_S2, Cube_num_verts);
+#if GTE_Coprocessor_UseTris
+#define Cube_num_faces 12
+typedef def_farray(V3_S2, Cube_num_faces)
+typedef A12_V3_S2 ACubeFaces;
+#endif
+#if GTE_Coprocessor_UseQuads
+#define Cube_num_faces 6
+typedef def_farray(V4_S2, Cube_num_faces);
+typedef A6_V4_S2 ACubeFaces;
+#endif
+void cube128_init(A8_V3_S2* verts, ACubeFaces* faces) {
 	memory_copy(verts, & (A8_V3_S2) {
 			{ -128, -128, -128 },
 			{  128, -128, -128 },
@@ -91,6 +103,7 @@ void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
 		},
 		size_of(A8_V3_S2)
 	);
+	#if GTE_Coprocessor_UseTris
 	memory_copy(faces, & (A12_V3_S2) {
 			{ 0, 3, 2 }, // top
 			{ 0, 2, 1 }, // top
@@ -107,6 +120,19 @@ void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
 		},
 		size_of(A12_V3_S2)
 	);
+	#endif
+	#if GTE_Coprocessor_UseQuads
+	memory_copy(faces, & (A6_V4_S2) {
+			{ 3, 2, 0, 1 },
+			{ 0, 1, 4, 5 },
+			{ 4, 5, 7, 6 },
+			{ 1, 2, 5, 6 },
+			{ 2, 3, 6, 7 },
+			{ 3, 0, 7, 4 },
+		},
+		sizeof(A6_V4_S2)
+	);
+	#endif
 	return;
 }

@@ -123,7 +149,7 @@ typedef def_struct(SMemory) {
 	M3_S2 tform_world;

 	A8_V3_S2   cube_verts;
-	A12_V3_S2 cube_faces;
+	ACubeFaces cube_faces;
 };
 global SMemory static_mem;
 extern SMemory static_mem;
@@ -195,7 +221,7 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
 	gte_matrix_set_rotation   (& static_mem.tform_world);
 	gte_matrix_set_translation(& static_mem.tform_world);

-#if 1
+#if GTE_Coprocessor_Chapter
 	S4 nclip = 0;
 	S4 orderingtbl_z = 0;
 	A2_S2 p;    //???
@@ -203,6 +229,7 @@ void update(PrimitiveArena* pa, U4* ordering_buf)

 	for (U4 face_id = 0; face_id < Cube_num_faces; face_id += 1)
 	{
+		#if GTE_Coprocessor_UseTris
 		Poly_G3* tri = prim_alloc(Poly_G3); set_poly_g3(tri);
 		tri->c0 = rgb8(255,   0, 255);
 		tri->c1 = rgb8(255, 255,   0);
@@ -231,8 +258,34 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
 		if ((orderingtbl_z > 0) && (orderingtbl_z < OrderingTbl_Len)) {
 			orderingtbl_add_primitive(ordering_buf[orderingtbl_z], tri);
 		}
+		#endif
+		#if GTE_Coprocessor_UseQuads
+		Poly_G4* quad = prim_alloc(Poly_G4); set_poly_g4(quad);
+		quad->c0 = rgb8(255,   0, 255);
+		quad->c1 = rgb8(255, 255,   0);
+		quad->c2 = rgb8(  0, 255, 255);
+		quad->c3 = rgb8(  0, 255,   0);
+
+		V4_S2* face = & static_mem.cube_faces[face_id];
+		V3_S2* p0   = & static_mem.cube_verts[face->x];
+		V3_S2* p1   = & static_mem.cube_verts[face->y];
+		V3_S2* p2   = & static_mem.cube_verts[face->z];
+		V3_S2* p3   = & static_mem.cube_verts[face->w];
+
+		nclip = rtp_avg_nclip_a4_v3s2(
+			p0, p1, p2, p3,
+			& quad->p0, & quad->p1, & quad->p2, & quad->p3,
+			& p, & orderingtbl_z, & flag
+		);
+		if (nclip <= 0) {
+			continue;
 		}

+		if ((orderingtbl_z > 0) && (orderingtbl_z < OrderingTbl_Len)) {
+			orderingtbl_add_primitive(ordering_buf[orderingtbl_z], quad);
+		}
+		#endif
+	}
 	static_mem.rotation.x +=  6;
 	static_mem.rotation.y +=  8;
 	static_mem.rotation.z += 12;
@@ -117,10 +117,10 @@ M3_S2* m3s2_scale      (M3_S2* mat, V3_S4* vec) __asm__("ScaleMatrix");
 // Rotation, Translation, Perspective

 S4 rtp_v3s2_raw(V3_S2* vec, S4* xy, S4* pp, S4* flag) __asm__("RotTransPers");
-finline S4 rtp_v3s2(V3_S2* vec, V2_S2* xy, A2_S2* pp, S4* flag) { return rtp_v3s2_raw(vec, cast(S4*R_, & xy->x), cast(S4*R_, pp), r_(flag)); }
+FI_ S4 rtp_v3s2(V3_S2* vec, V2_S2* xy, A2_S2* pp, S4* flag) { return rtp_v3s2_raw(vec, cast(S4*R_, & xy->x), cast(S4*R_, pp), r_(flag)); }

 S4 rtp_avg_nclip_a3_v3s2_raw(V3_S2* v0, V3_S2* v1, V3_S2* v2, S4* xy1, S4* xy2, S4* xy3, S4* pp, S4* otz, S4* flag) __asm__("RotAverageNclip3");
-finline  S4 rtp_avg_nclip_a3_v3s2(
+FI_  S4 rtp_avg_nclip_a3_v3s2(
 	V3_S2* v0,  V3_S2* v1,  V3_S2* v2, 
 	V2_S2* xy0, V2_S2* xy1, V2_S2* xy2, 
 	A2_S2* pp, S4* otz, S4* flag
@@ -132,6 +132,19 @@ finline  S4 rtp_avg_nclip_a3_v3s2(
 	);
 }

+// Four-vertex variant of the rtp_avg_nclip_a3 pair; the raw declaration binds to the external symbol "RotAverageNclip4".
+S4 rtp_avg_nclip_a4_v3s2_raw(V3_S2* v0, V3_S2* v1, V3_S2* v2, V3_S2* v3, S4* xy1, S4* xy2, S4* xy3, S4* xy4, S4* pp, S4* otz, S4* flag) __asm__("RotAverageNclip4");
+// Typed wrapper: accepts the four output points as V2_S2 and pp as A2_S2, hands each to the raw routine
+// as the S4* it declares, and returns the raw routine's result unchanged.
+FI_ S4 rtp_avg_nclip_a4_v3s2(
+	V3_S2* v0,  V3_S2* v1,  V3_S2* v2,  V3_S2* v3,
+	V2_S2* xy0, V2_S2* xy1, V2_S2* xy2, V2_S2* xy3,
+	A2_S2* pp,  S4* otz,    S4* flag
+){
+	return rtp_avg_nclip_a4_v3s2_raw(
+		v0, v1, v2, v3,
+		// x is the first member of V2_S2, so its address is the address of the whole point
+		cast(S4*R_, & xy0->x), cast(S4*R_, & xy1->x), cast(S4*R_, & xy2->x), cast(S4*R_, & xy3->x),
+		cast(S4*R_, pp), r_(otz), r_(flag)
+	);
+}
+
 void gte_matrix_set_rotation   (M3_S2* mat) __asm__("SetRotMatrix");
 void gte_matrix_set_translation(M3_S2* mat) __asm__("SetTransMatrix");

@@ -30,8 +30,10 @@ $f_wall             = "-Wall"
 $f_wno_attributes   = "-Wno-attributes"

 # Optimization Flags
-$f_optimize_none    = "-O0" # For Debug builds
-$f_optimize_size    = "-Os" # For Release builds
+$f_optimize_none       = "-O0" # no optimization
+$f_optimize_size       = "-Os" # optimize for size
+$f_optimize_intrinsics = "-Oi" # NOTE(review): -Oi is an MSVC option; if the compiler is GCC, -O accepts only a number, g, s, z or fast. Confirm before enabling
+$f_optimize_debug      = "-Og" # debug-friendly optimization; the level build-graphis_hello currently passes
 $f_omit_frame_ptr      = "-fomit-frame-pointer"

 # Environment & Standard Library Flags
@@ -289,8 +291,10 @@ function build-graphis_hello {

 	$compile_args = @()
 	$compile_args += $f_debug
-	$compile_args += $f_optimize_none
+	# $compile_args += $f_optimize_none
+	# $compile_args += $f_optimize_intrinsics
 	# $compile_args += $f_optimize_size
+	$compile_args += $f_optimize_debug
 	$compile_args += ($f_include + $path_code)
 	compile-unit $src_c $module_c $includes $compile_args