diff --git a/.gitignore b/.gitignore
index 0411a49..538ee20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,8 @@
 ﻿build
 toolchain/armips
 toolchain/pcsx-redux
-toolchain/psyq_iwyu
-toolchain/PSn00bSDK
+# toolchain/psyq_iwyu
+# toolchain/PSn00bSDK
 
 *.exe
 *.elf
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
index cc2d581..28d863a 100644
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@@ -14,7 +14,7 @@
                 "INTELLISENSE_DIRECTIVES"
             ],
             "intelliSenseMode": "gcc-x86",
-            "compilerPath": "C:\\Users\\Ed\\AppData\\Roaming\\mips\\versions\\v14.2.0\\bin\\mipsel-none-elf-gcc.exe",
+            "compilerPath": "C:/Users/Ed/scoop/apps/gcc/current/bin/gcc.exe"
         }
     ],
     "version": 4
diff --git a/code/duffle/dsl.h b/code/duffle/dsl.h
index c4c6b05..b0e4ba0 100644
--- a/code/duffle/dsl.h
+++ b/code/duffle/dsl.h
@@ -3,12 +3,17 @@
 #	include "assert.h"
 #endif
 
-#define align_(value)     __attribute__((aligned (value)))             // for easy alignment
-#define expect_(x, y)     __builtin_expect(x, y)                       // so compiler knows the common path
-#define finline           static inline __attribute__((always_inline)) // force inline
-#define no_inline         static        __attribute__((noinline))      // force no inline [used in thread api]
-#define R_                __restrict                                   // pointers are either restricted or volatile and nothing else 
-#define V_                volatile                                     // pointers are either restricted or volatile and nothing else
+#define LP_      static // local_persist: `static` spelled for function-local variables
+#define internal static // internal: `static` spelled for file-scope symbols
+#define global          // expands to nothing; reader-facing marker on global definitions
+#define gknown          // expands to nothing; annotation only (NOTE(review): intended meaning is not shown here)
+
+#define align_(value) __attribute__((aligned (value)))             // for easy alignment
+#define expect_(x, y) __builtin_expect(x, y)                       // so compiler knows the common path
+#define FI_           static inline __attribute__((always_inline)) // force inline
+#define NI_           static        __attribute__((noinline))      // force no inline [used in thread api]
+#define R_            __restrict                                   // pointers are either restricted or volatile and nothing else
+#define V_            volatile                                     // pointers are either restricted or volatile and nothing else
 
 #define glue_impl(A, B)          A ## B
 #define glue(A, B)               glue_impl(A, B)
@@ -16,12 +21,7 @@
 #define stringify(S)             stringify_impl(S)
 #define tmpl(prefix, type)       prefix ## _ ## type
 
-#define local_persist            static
-#define internal                 static
-#define global
-#define gknown
-
-#define offset_of(type, member)  cast(SSIZE, & (((type*) 0)->member))
+#define offset_of(type, member)  cast(U8,__builtin_offsetof(type,member)) // byte offset of member within type; NOTE(review): only U1/U2/U4 typedefs appear nearby - confirm U8 is declared
 #define static_assert            _Static_assert
 #define typeof                   __typeof__
 #define typeof_ptr(ptr)          typeof((ptr)[0])
@@ -32,9 +32,16 @@
 #define def_ptr_set(type)        def_R_(type); typedef def_V_(type)
 #define def_tset(type)           type; typedef def_ptr_set(type)
 
-typedef __UINT8_TYPE__  def_tset(U1); typedef __UINT16_TYPE__ def_tset(U2); typedef __UINT32_TYPE__ def_tset(U4);
-typedef __INT8_TYPE__   def_tset(S1); typedef __INT16_TYPE__  def_tset(S2); typedef __INT32_TYPE__  def_tset(S4);
-typedef unsigned char   def_tset(B1); typedef __UINT16_TYPE__ def_tset(B2); typedef __UINT32_TYPE__ def_tset(B4);
+typedef __UINT8_TYPE__  def_tset(U1); 
+typedef __UINT16_TYPE__ def_tset(U2);
+typedef __UINT32_TYPE__ def_tset(U4);
+typedef __INT8_TYPE__   def_tset(S1); 
+typedef __INT16_TYPE__  def_tset(S2); 
+typedef __INT32_TYPE__  def_tset(S4);
+typedef unsigned char   def_tset(B1); 
+typedef __UINT16_TYPE__ def_tset(B2); 
+typedef __UINT32_TYPE__ def_tset(B4);
+typedef __UINT64_TYPE__ def_tset(B8);
 enum { false = 0, true  = 1, true_overflow, };
 
 #define u1_r(value) cast(U1_R, value)
@@ -75,6 +82,8 @@ enum { false = 0, true  = 1, true_overflow, };
 
 #define r_(ptr)                             cast(typeof_ptr(ptr)*R_, ptr)
 #define v_(ptr)                             cast(typeof_ptr(ptr)*V_, ptr)
+#define tr_(type, ptr)                      cast(type*R_, ptr) // like r_(), but the pointee type is named explicitly: yields `type * __restrict`
+#define tv_(type, ptr)                      cast(type*V_, ptr) // like v_(), but the pointee type is named explicitly: yields `type * volatile`
 
 #define kilo(n)                             (cast(U4, n) << 10)
 #define mega(n)                             (cast(U4, n) << 20)
@@ -87,7 +96,7 @@ enum { false = 0, true  = 1, true_overflow, };
 #define sop_2(op, a, b) cast(U2, s2_(a) op s2_(b))
 #define sop_4(op, a, b) cast(U4, s4_(a) op s4_(b))
 
-#define def_signed_op(id, op, width) finline U ## width id ## _s ## width(U ## width a, U ## width b) {return sop_ ## width(op, a, b); }
+#define def_signed_op(id, op, width) FI_ U ## width id ## _s ## width(U ## width a, U ## width b) {return sop_ ## width(op, a, b); }
 #define def_signed_ops(id, op)       def_signed_op(id, op, 1) def_signed_op(id, op, 2) def_signed_op(id, op, 4)
 def_signed_ops(add, +) def_signed_ops(sub, -)
 def_signed_ops(mut, *) def_signed_ops(div, /)
@@ -103,13 +112,13 @@ def_signed_ops(ge, >=) def_signed_ops(le, <=)
 #define ge_s(a,b)  def_generic_sop(ge, a,b)
 #define le_s(a,b)  def_generic_sop(le, a,b)
 
-#define span_iter(type, iter, m_begin, op, m_end)  \
-	tmpl(Iter_Span,type) iter = { \
-		.r = {(m_begin), (m_end)},  \
-		.cursor = (m_begin) };      \
-	iter.cursor op iter.r.end;    \
-	++ iter.cursor
-
+#define span_iter(type, iter, m_begin, op, m_end) ( \
+	tmpl(Iter_Span,type) iter = {     \
+		.r      = {(m_begin), (m_end)}, \
+		.cursor = (m_begin) };          \
+	iter.cursor op iter.r.end;        \
+	++ iter.cursor                    \
+)
 #define def_span(type)                                                \
 	        def_struct(tmpl(     Span,type)) { type begin; type end; }; \
 	typedef def_struct(tmpl(Iter_Span,type)) { tmpl(Span,type) r; type cursor; }
@@ -127,7 +136,7 @@ typedef def_struct(Slice_Str8) { Str8* ptr; U4 len; };
 #define def_Slice(type)           def_struct(tmpl(Slice,type)) { type* ptr; U4 len; }
 #define slice_assert(slice)       do { assert((slice).ptr != nullptr); assert((slice).len > 0); } while(0)
 #define slice_end(slice)          ((slice).ptr + (slice).len)
-#define size_of_slice_type(slice) size_of( * (slice).ptr )
+#define size_of_slice_type(slice) size_of((slice).ptr[0])
 
 typedef def_Slice(void);
 typedef def_Slice(B1);
@@ -142,10 +151,11 @@ void slice__zero(Slice_B1 mem, U4 typewidth);
 } while (0)
 #define slice_zero(slice) slice__zero(slice_byte(slice), size_of_slice_type(slice))
 
-#define slice_iter(container, iter)               \
+#define slice_iter(container, iter) (             \
 	typeof((container).ptr) iter = (container).ptr; \
 	iter != slice_end(container);                   \
-	++ iter
+	++ iter                                         \
+)
 #define slice_from_farray(type, ...) & (tmpl(Slice,type)) { \
 	.ptr = farray_init(type, __VA_ARGS__),             \
 	.len = farray_len( farray_init(type, __VA_ARGS__)) \
diff --git a/code/duffle/math.h b/code/duffle/math.h
index 0238ea8..2703776 100644
--- a/code/duffle/math.h
+++ b/code/duffle/math.h
@@ -16,10 +16,14 @@ typedef S2 A3A3_S2[3][3];
 
 typedef def_struct(Extent2_S2) { S2 width; S2 height; };
 typedef def_struct(Extent2_S4) { S4 width; S4 height; };
+
 typedef def_struct(V2_S2)      { S2 x; S2 y; };
 typedef def_struct(V2_S4)      { S4 x; S4 y; };
 typedef def_struct(V3_S2)      { S2 x; S2 y; S2 z; S2 pad; };
 typedef def_struct(V3_S4)      { S4 x; S4 y; S4 z; S4 pad; };
+typedef def_struct(V4_S2)      { S2 x; S2 y; S2 z; S2 w; };
+typedef def_struct(V4_S4)      { S4 x; S4 y; S4 z; S4 w; };
+
 typedef def_struct(R2_S2)      { V2_S2 p0; V2_S2 p1; };
 typedef def_struct(R2_S4)      { V2_S4 p0; V2_S4 p1; };
 
@@ -28,6 +32,8 @@ typedef def_struct(Rect_S4) { S4 x; S4 y; S4 width; S4 height; };
 
 typedef def_struct(M3_S2) { A3A3_S2 m; A3_S4 t; };
 
-#define v2s2(x,y)   (V2_S2){x,y}
-#define v3s2(x,y,z) (V3_S2){x,y,z}
-#define v3s4(x,y,z) (V3_S4){x,y,z}
+#define v2s2(x,y)     (V2_S2){x,y}     // compound literal: x, y
+#define v3s2(x,y,z)   (V3_S2){x,y,z}   // compound literal: x, y, z; the pad member gets no initializer and is therefore zero
+#define v3s4(x,y,z)   (V3_S4){x,y,z}   // compound literal: x, y, z; the pad member gets no initializer and is therefore zero
+#define v4s2(x,y,z,w) (V4_S2){x,y,z,w} // compound literal: x, y, z, w in member order
+#define v4s4(x,y,z,w) (V4_S4){x,y,z,w} // compound literal: x, y, z, w in member order
diff --git a/code/duffle/memory.h b/code/duffle/memory.h
index 27dcc38..76d1268 100644
--- a/code/duffle/memory.h
+++ b/code/duffle/memory.h
@@ -94,8 +94,7 @@ struct AllocatorProc_Out {
 	U4                  left; // Contiguous memory left
 	U4                  max_alloc;
 	U4                  min_alloc;
-	B4                  continuity_break; // Whether this allocation broke continuity with the previous (address space wise)
-	byte_pad(4);
+	// byte_pad(8);
 };
 typedef def_struct(AllocatorInfo) {
 	AllocatorProc* proc;
@@ -108,8 +107,7 @@ typedef def_struct(AllocatorQueryInfo) {
 	U4                  left; // Contiguous memory left
 	U4                  max_alloc;
 	U4                  min_alloc;
-	B4                  continuity_break; // Whether this allocation broke continuity with the previous (address space wise)
-	byte_pad(4);
+	// byte_pad(4);
 };
 static_assert(size_of(AllocatorProc_Out) == size_of(AllocatorQueryInfo));
 
diff --git a/code/graphics_hello_psyq/hello_gpu.c b/code/graphics_hello_psyq/hello_gpu.c
index e8e5538..d4d684f 100644
--- a/code/graphics_hello_psyq/hello_gpu.c
+++ b/code/graphics_hello_psyq/hello_gpu.c
@@ -11,6 +11,8 @@
 #include "duffle/gp.h"
 #include "hello_gpu.h"
 
+#define GTE_Coprocessor_Chapter 1
+
 typedef def_farray(V2_S2, 3);
 typedef def_struct(Poly_F3) {
 	U4   tag;
@@ -73,12 +75,22 @@ typedef def_struct(PrimitiveArena) {
 	U4                 used;
 };
 
-#define Cube_num_verts 8
-#define Cube_num_faces 12
-typedef def_farray(V3_S2, Cube_num_verts);
-typedef def_farray(V3_S2, Cube_num_faces);
+#define GTE_Coprocessor_UseQuads 1 // 1: cube faces are V4_S2 index quads drawn as Poly_G4
+#define GTE_Coprocessor_UseTris  0 // 1: cube faces are V3_S2 index triples drawn as Poly_G3; enable only one of the two, both define Cube_num_faces and ACubeFaces
 
-void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
+#define Cube_num_verts 8
+typedef def_farray(V3_S2, Cube_num_verts); // declares A8_V3_S2, the vertex array type
+#if GTE_Coprocessor_UseTris
+#define Cube_num_faces 12
+typedef def_farray(V3_S2, Cube_num_faces); // declares A12_V3_S2: one index triple per triangle
+typedef A12_V3_S2 ACubeFaces;
+#endif // GTE_Coprocessor_UseTris
+#if GTE_Coprocessor_UseQuads
+#define Cube_num_faces 6
+typedef def_farray(V4_S2, Cube_num_faces); // declares A6_V4_S2: one index quadruple per quad
+typedef A6_V4_S2 ACubeFaces;
+#endif // GTE_Coprocessor_UseQuads
+void cube128_init(A8_V3_S2* verts, ACubeFaces* faces) {
 	memory_copy(verts, & (A8_V3_S2) {
 			{ -128, -128, -128 },
 			{  128, -128, -128 },
@@ -91,6 +103,7 @@ void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
 		},
 		size_of(A8_V3_S2)
 	);
+	#if GTE_Coprocessor_UseTris
 	memory_copy(faces, & (A12_V3_S2) {
 			{ 0, 3, 2 }, // top
 			{ 0, 2, 1 }, // top
@@ -107,6 +120,19 @@ void cube128_init(A8_V3_S2* verts, A12_V3_S2* faces) {
 		},
 		size_of(A12_V3_S2)
 	);
+	#endif
+	#if GTE_Coprocessor_UseQuads
+	memory_copy(faces, & (A6_V4_S2) { // each row: four vertex indices, later read as face->x..w
+			{ 3, 2, 0, 1 },
+			{ 0, 1, 4, 5 },
+			{ 4, 5, 7, 6 },
+			{ 1, 2, 5, 6 },
+			{ 2, 3, 6, 7 },
+			{ 3, 0, 7, 4 },
+		},
+		size_of(A6_V4_S2)
+	);
+	#endif // GTE_Coprocessor_UseQuads
 	return;
 }
 
@@ -122,8 +148,8 @@ typedef def_struct(SMemory) {
 
 	M3_S2 tform_world;
 
-	A8_V3_S2  cube_verts;
-	A12_V3_S2 cube_faces;
+	A8_V3_S2   cube_verts;
+	ACubeFaces cube_faces;
 };
 global SMemory static_mem;
 extern SMemory static_mem;
@@ -195,7 +221,7 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
 	gte_matrix_set_rotation   (& static_mem.tform_world);
 	gte_matrix_set_translation(& static_mem.tform_world);
 
-#if 1
+#if GTE_Coprocessor_Chapter
 	S4 nclip = 0;
 	S4 orderingtbl_z = 0;
 	A2_S2 p;    //???
@@ -203,6 +229,7 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
 
 	for (U4 face_id = 0; face_id < Cube_num_faces; face_id += 1)
 	{
+		#if GTE_Coprocessor_UseTris
 		Poly_G3* tri = prim_alloc(Poly_G3); set_poly_g3(tri);
 		tri->c0 = rgb8(255,   0, 255);
 		tri->c1 = rgb8(255, 255,   0);
@@ -231,8 +258,34 @@ void update(PrimitiveArena* pa, U4* ordering_buf)
 		if ((orderingtbl_z > 0) && (orderingtbl_z < OrderingTbl_Len)) {
 			orderingtbl_add_primitive(ordering_buf[orderingtbl_z], tri);
 		}
-	}
+		#endif
+		#if GTE_Coprocessor_UseQuads
+		Poly_G4* quad = prim_alloc(Poly_G4); set_poly_g4(quad);
+		quad->c0 = rgb8(255,   0, 255);
+		quad->c1 = rgb8(255, 255,   0);
+		quad->c2 = rgb8(  0, 255, 255);
+		quad->c3 = rgb8(  0, 255,   0);
 
+		V4_S2* face = & static_mem.cube_faces[face_id];     // index quadruple for this face
+		V3_S2* p0   = & static_mem.cube_verts[face->x];     // resolve the four indices to vertices
+		V3_S2* p1   = & static_mem.cube_verts[face->y];
+		V3_S2* p2   = & static_mem.cube_verts[face->z];
+		V3_S2* p3   = & static_mem.cube_verts[face->w];
+
+		nclip = rtp_avg_nclip_a4_v3s2(                      // output points are written straight into the primitive
+			p0, p1, p2, p3,
+			& quad->p0, & quad->p1, & quad->p2, & quad->p3,
+			& p, & orderingtbl_z, & flag
+		);
+		if (nclip <= 0) {                                   // NOTE(review): presumably a back-facing / degenerate face - confirm against the RotAverageNclip4 docs
+			continue;
+		}
+
+		if ((orderingtbl_z > 0) && (orderingtbl_z < OrderingTbl_Len)) { // link the quad only when the index lies strictly inside the table
+			orderingtbl_add_primitive(ordering_buf[orderingtbl_z], quad);
+		}
+		#endif // GTE_Coprocessor_UseQuads
+	}
 	static_mem.rotation.x +=  6;
 	static_mem.rotation.y +=  8;
 	static_mem.rotation.z += 12;
diff --git a/code/graphics_hello_psyq/hello_gpu.h b/code/graphics_hello_psyq/hello_gpu.h
index f82ec78..dfb4993 100644
--- a/code/graphics_hello_psyq/hello_gpu.h
+++ b/code/graphics_hello_psyq/hello_gpu.h
@@ -117,10 +117,10 @@ M3_S2* m3s2_scale      (M3_S2* mat, V3_S4* vec) __asm__("ScaleMatrix");
 // Rotation, Translation, Perspective
 
 S4 rtp_v3s2_raw(V3_S2* vec, S4* xy, S4* pp, S4* flag) __asm__("RotTransPers");
-finline S4 rtp_v3s2(V3_S2* vec, V2_S2* xy, A2_S2* pp, S4* flag) { return rtp_v3s2_raw(vec, cast(S4*R_, & xy->x), cast(S4*R_, pp), r_(flag)); }
+FI_ S4 rtp_v3s2(V3_S2* vec, V2_S2* xy, A2_S2* pp, S4* flag) { return rtp_v3s2_raw(vec, cast(S4*R_, & xy->x), cast(S4*R_, pp), r_(flag)); }
 
 S4 rtp_avg_nclip_a3_v3s2_raw(V3_S2* v0, V3_S2* v1, V3_S2* v2, S4* xy1, S4* xy2, S4* xy3, S4* pp, S4* otz, S4* flag) __asm__("RotAverageNclip3");
-finline  S4 rtp_avg_nclip_a3_v3s2(
+FI_  S4 rtp_avg_nclip_a3_v3s2(
 	V3_S2* v0,  V3_S2* v1,  V3_S2* v2, 
 	V2_S2* xy0, V2_S2* xy1, V2_S2* xy2, 
 	A2_S2* pp, S4* otz, S4* flag
@@ -132,6 +132,19 @@ finline  S4 rtp_avg_nclip_a3_v3s2(
 	);
 }
 
+// Raw prototype: the __asm__ label binds it to the external symbol "RotAverageNclip4"; every output is a plain S4*.
+S4 rtp_avg_nclip_a4_v3s2_raw(V3_S2* v0, V3_S2* v1, V3_S2* v2, V3_S2* v3, S4* xy1, S4* xy2, S4* xy3, S4* xy4, S4* pp, S4* otz, S4* flag) __asm__("RotAverageNclip4");
+// Typed front end: accepts V2_S2 / A2_S2 outputs, reinterprets them as S4* and forwards the raw call's return value.
+FI_ S4 rtp_avg_nclip_a4_v3s2(
+	V3_S2* v0,  V3_S2* v1,  V3_S2* v2,  V3_S2* v3,
+	V2_S2* xy0, V2_S2* xy1, V2_S2* xy2, V2_S2* xy3,
+	A2_S2* pp,  S4* otz,    S4* flag
+){
+	S4* out0 = cast(S4*R_, xy0); S4* out1 = cast(S4*R_, xy1); // each V2_S2 (two S2 members) is handed over as one S4
+	S4* out2 = cast(S4*R_, xy2); S4* out3 = cast(S4*R_, xy3);
+	return rtp_avg_nclip_a4_v3s2_raw(v0, v1, v2, v3, out0, out1, out2, out3, cast(S4*R_, pp), r_(otz), r_(flag));
+}
+
 void gte_matrix_set_rotation   (M3_S2* mat) __asm__("SetRotMatrix");
 void gte_matrix_set_translation(M3_S2* mat) __asm__("SetTransMatrix");
 
diff --git a/scripts/build_psyq.ps1 b/scripts/build_psyq.ps1
index bd4e4b0..6872d39 100644
--- a/scripts/build_psyq.ps1
+++ b/scripts/build_psyq.ps1
@@ -30,9 +30,11 @@ $f_wall             = "-Wall"
 $f_wno_attributes   = "-Wno-attributes"
 
 # Optimization Flags
-$f_optimize_none    = "-O0" # For Debug builds
-$f_optimize_size    = "-Os" # For Release builds
-$f_omit_frame_ptr   = "-fomit-frame-pointer"
+$f_optimize_none       = "-O0" # For Debug builds
+$f_optimize_size       = "-Os" # For Release builds
+$f_optimize_intrinsics = "-Oi" # NOTE(review): /Oi is MSVC's intrinsics switch; GCC documents no -Oi level - confirm before enabling
+$f_optimize_debug      = "-Og" # GCC: optimize while keeping the debugging experience
+$f_omit_frame_ptr      = "-fomit-frame-pointer"
 
 # Environment & Standard Library Flags
 $f_no_stdlib        = "-nostdlib"
@@ -289,8 +291,10 @@ function build-graphis_hello {
 
 	$compile_args = @()
 	$compile_args += $f_debug
-	$compile_args += $f_optimize_none
+	# $compile_args += $f_optimize_none
+	# $compile_args += $f_optimize_intrinsics
 	# $compile_args += $f_optimize_size
+	$compile_args += $f_optimize_debug
 	$compile_args += ($f_include + $path_code)
 	compile-unit $src_c $module_c $includes $compile_args