Improvements made while working on the Sectr Prototype

This commit is contained in:
2024-06-28 07:52:48 -04:00
parent 7e34131884
commit d454778fd6
9 changed files with 668 additions and 434 deletions

View File

@@ -28,11 +28,11 @@ PoolList :: struct {
pool_list_init :: proc( pool : ^PoolList, capacity : u32, dbg_name : string = "" )
{
error : AllocatorError
pool.items, error = make( [dynamic]PoolListItem, u64(capacity) )
pool.items, error = make( [dynamic]PoolListItem, int(capacity) )
assert( error == .None, "VEFontCache.pool_list_init : Failed to allocate items array")
resize( & pool.items, capacity )
pool.free_list, error = make( [dynamic]PoolListIter, u64(capacity) )
pool.free_list, error = make( [dynamic]PoolListIter, len = 0, cap = int(capacity) )
assert( error == .None, "VEFontCache.pool_list_init : Failed to allocate free_list array")
resize( & pool.free_list, capacity )
@@ -120,6 +120,23 @@ pool_list_erase :: proc( pool : ^PoolList, iter : PoolListIter )
}
}
// Detaches `iter` from its current position in the doubly-linked pool list and
// re-links it as the new front node (most-recently-used slot).
// No-op when `iter` is already the front.
pool_list_move_to_front :: #force_inline proc( pool : ^PoolList, iter : PoolListIter )
{
using pool
if front == iter do return
item := & items[iter]
// Unlink `item` from its neighbors; -1 marks the absence of a link.
if item.prev != -1 do items[ item.prev ].next = item.next
if item.next != -1 do items[ item.next ].prev = item.prev
// If `item` was the back node, the back moves to its predecessor.
if back == iter do back = item.prev
// Splice `item` in ahead of the old front and make it the new front.
item.prev = -1
item.next = front
items[ front ].prev = iter
front = iter
}
pool_list_peek_back :: #force_inline proc ( pool : ^PoolList ) -> PoolListValue {
assert( pool.back != - 1 )
value := pool.items[ pool.back ].value
@@ -181,12 +198,11 @@ LRU_find :: #force_inline proc "contextless" ( cache : ^LRU_Cache, key : u64, mu
}
LRU_get :: #force_inline proc( cache: ^LRU_Cache, key : u64 ) -> i32 {
iter, success := LRU_find( cache, key )
if success == false {
return -1
if link, ok := &cache.table[ key ]; ok {
pool_list_move_to_front(&cache.key_queue, link.ptr)
return link.value
}
LRU_refresh( cache, key )
return iter.value
return -1
}
LRU_get_next_evicted :: #force_inline proc ( cache : ^LRU_Cache ) -> u64
@@ -208,10 +224,9 @@ LRU_peek :: #force_inline proc ( cache : ^LRU_Cache, key : u64, must_find := fal
LRU_put :: #force_inline proc( cache : ^LRU_Cache, key : u64, value : i32 ) -> u64
{
iter, success := cache.table[key]
if success {
LRU_refresh( cache, key )
iter.value = value
if link, ok := & cache.table[ key ]; ok {
pool_list_move_to_front( & cache.key_queue, link.ptr )
link.value = value
return key
}
@@ -225,7 +240,7 @@ LRU_put :: #force_inline proc ( cache : ^LRU_Cache, key : u64, value : i32 ) ->
pool_list_push_front(&cache.key_queue, key)
cache.table[key] = LRU_Link{
value = value,
ptr = cache.key_queue.front
ptr = cache.key_queue.front,
}
cache.num += 1
return evict

View File

@@ -1,29 +1,51 @@
# VE Font Cache : Odin Port
This is a port of the library base on the [original](https://github.com/hypernewbie/VEFontCache).
This is a port of the library based on [fork](https://github.com/hypernewbie/VEFontCache)
Its original purpose was for use in game engines, however its rendering quality and performance is more than adequate for many other applications.
See: [docs/Readme.md](docs/Readme.md) for the library's interface
TODO (Making it a more idiomatic library):
## TODOs
### (Making it a more idiomatic library):
* Setup freetype, harfbuzz, dependency management within the library
TODO Documentation:
### Documentation:
* Pureref outline of draw_text execution
* Markdown general documentation
TODO Content:
### Content:
* Port over the original demo utilizing sokol libraries instead
* Provide a sokol_gfx backend package
TODO Additional Features:
### Additional Features:
* Support for freetype
* Support for harfbuzz
* Ability to set a draw transform, viewport and projection
* By default the library's position is in unsigned normalized render space
* Allow curve_quality to be set on a per-font basis
### Optimization:
* Look into setting up multi-threading by giving each thread a context
* There is a heavy performance bottleneck in iterating the text/shape/glyphs on the cpu (single-thread) vs the actual rendering
* draw_text can provide in the context a job list per thread for the user to then hook up to their own threading solution to handle.
* Context would need to be segregated into staged data structures for each thread to utilize
* Each should have their own?
* draw_list
* draw_layer
* atlas.next_idx
* glyph_draw_buffer
* shape_cache
* This would need to converge to the singular draw_list on a per layer basis (then, when the user requests a draw_list layer, there could be a yield to wait for the jobs to finish) if the interface expects the user to issue the commands single-threaded; unless we just assume the user is going to feed the gpu the commands & data through separate threads as well (not ideal ux).
Failed Attempts:
* Attempted to chunk the text to more granular 'shapes' from `draw_list` before doing the actual call to `draw_text_shape`. This lead to a larger performance cost due to the additional iteration across the text string.
* Attempted to cache the shape draw_list for future calls. Led to larger performance cost due to additional iteration in the `merge_draw_list`.
* The shape's glyphs must still be traversed to identify if the glyph is cached. This arguably could be handled in `shape_text_uncached`, however that would require a significant amount of refactoring to identify... (and would be more unergonomic when shaper libs are processing the text)

View File

@@ -44,7 +44,7 @@ Context :: struct {
entries : [dynamic]Entry,
temp_path : [dynamic]Vec2,
temp_path : [dynamic]Vertex,
temp_codepoint_seen : map[u64]bool,
temp_codepoint_seen_num : u32,
@@ -133,8 +133,8 @@ InitShapeCacheParams :: struct {
}
InitShapeCacheParams_Default :: InitShapeCacheParams {
capacity = 1024,
reserve_length = 1024,
capacity = 2048,
reserve_length = 2048,
}
// ve_fontcache_init
@@ -145,8 +145,8 @@ startup :: proc( ctx : ^Context, parser_kind : ParserKind,
shape_cache_params := InitShapeCacheParams_Default,
curve_quality : u32 = 3,
entires_reserve : u32 = 512,
temp_path_reserve : u32 = 512,
temp_codepoint_seen_reserve : u32 = 512,
temp_path_reserve : u32 = 1024,
temp_codepoint_seen_reserve : u32 = 2048,
)
{
assert( ctx != nil, "Must provide a valid context" )
@@ -161,25 +161,26 @@ startup :: proc( ctx : ^Context, parser_kind : ParserKind,
ctx.curve_quality = curve_quality
error : AllocatorError
entries, error = make( [dynamic]Entry, entires_reserve )
entries, error = make( [dynamic]Entry, len = 0, cap = entires_reserve )
assert(error == .None, "VEFontCache.init : Failed to allocate entries")
temp_path, error = make( [dynamic]Vec2, temp_path_reserve )
temp_path, error = make( [dynamic]Vertex, len = 0, cap = temp_path_reserve )
assert(error == .None, "VEFontCache.init : Failed to allocate temp_path")
temp_codepoint_seen, error = make( map[u64]bool, uint(temp_codepoint_seen_reserve) )
assert(error == .None, "VEFontCache.init : Failed to allocate temp_path")
draw_list.vertices, error = make( [dynamic]Vertex, 4 * Kilobyte )
draw_list.vertices, error = make( [dynamic]Vertex, len = 0, cap = 4 * Kilobyte )
assert(error == .None, "VEFontCache.init : Failed to allocate draw_list.vertices")
draw_list.indices, error = make( [dynamic]u32, 8 * Kilobyte )
draw_list.indices, error = make( [dynamic]u32, len = 0, cap = 8 * Kilobyte )
assert(error == .None, "VEFontCache.init : Failed to allocate draw_list.indices")
draw_list.calls, error = make( [dynamic]DrawCall, 512 )
draw_list.calls, error = make( [dynamic]DrawCall, len = 0, cap = 512 )
assert(error == .None, "VEFontCache.init : Failed to allocate draw_list.calls")
init_atlas_region :: proc( region : ^AtlasRegion, params : InitAtlasParams, region_params : InitAtlasRegionParams, factor : Vec2i, expected_cap : i32 ) {
init_atlas_region :: proc( region : ^AtlasRegion, params : InitAtlasParams, region_params : InitAtlasRegionParams, factor : Vec2i, expected_cap : i32 )
{
using region
next_idx = 0;
@@ -225,11 +226,20 @@ startup :: proc( ctx : ^Context, parser_kind : ParserKind,
for idx : u32 = 0; idx < shape_cache_params.capacity; idx += 1 {
stroage_entry := & shape_cache.storage[idx]
using stroage_entry
glyphs, error = make( [dynamic]Glyph, shape_cache_params.reserve_length )
glyphs, error = make( [dynamic]Glyph, len = 0, cap = shape_cache_params.reserve_length )
assert( error == .None, "VEFontCache.init : Failed to allocate glyphs array for shape cache storage" )
positions, error = make( [dynamic]Vec2, shape_cache_params.reserve_length )
positions, error = make( [dynamic]Vec2, len = 0, cap = shape_cache_params.reserve_length )
assert( error == .None, "VEFontCache.init : Failed to allocate positions array for shape cache storage" )
draw_list.calls, error = make( [dynamic]DrawCall, len = 0, cap = glyph_draw_params.buffer_batch * 2 )
assert( error == .None, "VEFontCache.init : Failed to allocate calls for draw_list" )
draw_list.indices, error = make( [dynamic]u32, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 6 )
assert( error == .None, "VEFontCache.init : Failed to allocate indices array for draw_list" )
draw_list.vertices, error = make( [dynamic]Vertex, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 4 )
assert( error == .None, "VEFontCache.init : Failed to allocate vertices array for draw_list" )
}
// Note(From original author): We can actually go over VE_FONTCACHE_GLYPHDRAW_BUFFER_BATCH batches due to smart packing!
@@ -241,22 +251,22 @@ startup :: proc( ctx : ^Context, parser_kind : ParserKind,
height = atlas.region_d.height * u32(over_sample.y)
draw_padding = glyph_draw_params.draw_padding
draw_list.calls, error = make( [dynamic]DrawCall, cast(u64) glyph_draw_params.buffer_batch * 2 )
draw_list.calls, error = make( [dynamic]DrawCall, len = 0, cap = glyph_draw_params.buffer_batch * 2 )
assert( error == .None, "VEFontCache.init : Failed to allocate calls for draw_list" )
draw_list.indices, error = make( [dynamic]u32, cast(u64) glyph_draw_params.buffer_batch * 2 * 6 )
draw_list.indices, error = make( [dynamic]u32, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 6 )
assert( error == .None, "VEFontCache.init : Failed to allocate indices array for draw_list" )
draw_list.vertices, error = make( [dynamic]Vertex, glyph_draw_params.buffer_batch * 2 * 4 )
draw_list.vertices, error = make( [dynamic]Vertex, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 4 )
assert( error == .None, "VEFontCache.init : Failed to allocate vertices array for draw_list" )
clear_draw_list.calls, error = make( [dynamic]DrawCall, cast(u64) glyph_draw_params.buffer_batch * 2 )
clear_draw_list.calls, error = make( [dynamic]DrawCall, len = 0, cap = glyph_draw_params.buffer_batch * 2 )
assert( error == .None, "VEFontCache.init : Failed to allocate calls for calls for clear_draw_list" )
clear_draw_list.indices, error = make( [dynamic]u32, cast(u64) glyph_draw_params.buffer_batch * 2 * 4 )
clear_draw_list.indices, error = make( [dynamic]u32, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 4 )
assert( error == .None, "VEFontCache.init : Failed to allocate calls for indices array for clear_draw_list" )
clear_draw_list.vertices, error = make( [dynamic]Vertex, glyph_draw_params.buffer_batch * 2 * 4 )
clear_draw_list.vertices, error = make( [dynamic]Vertex, len = 0, cap = glyph_draw_params.buffer_batch * 2 * 4 )
assert( error == .None, "VEFontCache.init : Failed to allocate vertices array for clear_draw_list" )
}
@@ -395,7 +405,7 @@ configure_snap :: #force_inline proc( ctx : ^Context, snap_width, snap_height :
get_cursor_pos :: #force_inline proc "contextless" ( ctx : ^Context ) -> Vec2 { return ctx.cursor_pos }
set_colour :: #force_inline proc "contextless" ( ctx : ^Context, colour : Colour ) { ctx.colour = colour }
draw_text :: proc( ctx : ^Context, font : FontID, text_utf8 : string, position : Vec2, scale : Vec2 ) -> b32
draw_text :: proc( ctx : ^Context, font : FontID, text_utf8 : string, position, scale : Vec2 ) -> b32
{
// profile(#procedure)
assert( ctx != nil )
@@ -471,24 +481,9 @@ measure_text_size :: proc( ctx : ^Context, font : FontID, text_utf8 : string ) -
assert( ctx != nil )
assert( font >= 0 && int(font) < len(ctx.entries) )
atlas := ctx.atlas
entry := &ctx.entries[font]
shaped := shape_text_cached(ctx, font, text_utf8, entry)
padding := cast(f32) atlas.glyph_padding
for index : i32 = 0; index < i32(len(shaped.glyphs)); index += 1
{
glyph_index := shaped.glyphs[ index ]
if is_empty( ctx, entry, glyph_index ) do continue
bounds_0, bounds_1 := parser_get_glyph_box( & entry.parser_info, glyph_index )
bounds_size := bounds_1 - bounds_0
glyph_size := Vec2 { f32(bounds_size.x), f32(bounds_size.y) } * entry.size_scale
measured.y = max(measured.y, glyph_size.y)
}
measured.x = shaped.end_cursor_pos.x
return measured
return shaped.size
}
get_font_vertical_metrics :: #force_inline proc ( ctx : ^Context, font : FontID ) -> ( ascent, descent, line_gap : i32 )

View File

@@ -86,11 +86,77 @@ atlas_bbox :: proc( atlas : ^Atlas, region : AtlasRegionKind, local_idx : i32 )
return
}
// decide_codepoint_region :: proc( ctx : ^Context, entry : ^Entry, glyph_index : Glyph
// ) -> (region_kind : AtlasRegionKind, region : ^AtlasRegion, over_sample : Vec2)
// {
// if parser_is_glyph_empty( & entry.parser_info, glyph_index ) {
// region_kind = .None
// }
// bounds_0, bounds_1 := parser_get_glyph_box( & entry.parser_info, glyph_index )
// bounds_width := f32(bounds_1.x - bounds_0.x)
// bounds_height := f32(bounds_1.y - bounds_0.y)
// atlas := & ctx.atlas
// glyph_buffer := & ctx.glyph_buffer
// glyph_padding := f32(atlas.glyph_padding) * 2
// bounds_width_scaled := cast(u32) (bounds_width * entry.size_scale + glyph_padding)
// bounds_height_scaled := cast(u32) (bounds_height * entry.size_scale + glyph_padding)
// if bounds_width_scaled <= atlas.region_a.width && bounds_height_scaled <= atlas.region_a.height
// {
// // Region A for small glyphs. These are good for things such as punctuation.
// region_kind = .A
// region = & atlas.region_a
// }
// else if bounds_width_scaled <= atlas.region_b.width && bounds_height_scaled <= atlas.region_b.height
// {
// // Region B for tall glyphs. These are good for things such as european alphabets.
// region_kind = .B
// region = & atlas.region_b
// }
// else if bounds_width_scaled <= atlas.region_c.width && bounds_height_scaled <= atlas.region_c.height
// {
// // Region C for big glyphs. These are good for things such as asian typography.
// region_kind = .C
// region = & atlas.region_c
// }
// else if bounds_width_scaled <= atlas.region_d.width && bounds_height_scaled <= atlas.region_d.height
// {
// // Region D for huge glyphs. These are good for things such as titles and 4k.
// region_kind = .D
// region = & atlas.region_d
// }
// else if bounds_width_scaled <= glyph_buffer.width && bounds_height_scaled <= glyph_buffer.height
// {
// // Region 'E' for massive glyphs. These are rendered uncached and un-oversampled.
// region_kind = .E
// region = nil
// if bounds_width_scaled <= glyph_buffer.width / 2 && bounds_height_scaled <= glyph_buffer.height / 2 {
// over_sample = { 2.0, 2.0 }
// }
// else {
// over_sample = { 1.0, 1.0 }
// }
// return
// }
// else {
// region_kind = .None
// return
// }
// over_sample = glyph_buffer.over_sample
// assert(region != nil)
// return
// }
decide_codepoint_region :: proc(ctx : ^Context, entry : ^Entry, glyph_index : Glyph
) -> (region_kind : AtlasRegionKind, region : ^AtlasRegion, over_sample : Vec2)
{
if parser_is_glyph_empty(&entry.parser_info, glyph_index) {
region_kind = .None
return .None, nil, {}
}
bounds_0, bounds_1 := parser_get_glyph_box(&entry.parser_info, glyph_index)
@@ -99,55 +165,31 @@ decide_codepoint_region :: proc( ctx : ^Context, entry : ^Entry, glyph_index : G
atlas := & ctx.atlas
glyph_buffer := & ctx.glyph_buffer
glyph_padding := f32( atlas.glyph_padding ) * 2
bounds_width_scaled := cast(u32) (bounds_width * entry.size_scale + glyph_padding)
bounds_height_scaled := cast(u32) (bounds_height * entry.size_scale + glyph_padding)
bounds_width_scaled := u32(bounds_width * entry.size_scale + glyph_padding)
bounds_height_scaled := u32(bounds_height * entry.size_scale + glyph_padding)
if bounds_width_scaled <= atlas.region_a.width && bounds_height_scaled <= atlas.region_a.height
{
// Region A for small glyphs. These are good for things such as punctuation.
region_kind = .A
region = & atlas.region_a
}
else if bounds_width_scaled <= atlas.region_b.width && bounds_height_scaled <= atlas.region_b.height
{
// Region B for tall glyphs. These are good for things such as european alphabets.
region_kind = .B
region = & atlas.region_b
}
else if bounds_width_scaled <= atlas.region_c.width && bounds_height_scaled <= atlas.region_c.height
{
// Region C for big glyphs. These are good for things such as asian typography.
region_kind = .C
region = & atlas.region_c
}
else if bounds_width_scaled <= atlas.region_d.width && bounds_height_scaled <= atlas.region_d.height
{
// Region D for huge glyphs. These are good for things such as titles and 4k.
region_kind = .D
region = & atlas.region_d
}
else if bounds_width_scaled <= glyph_buffer.width && bounds_height_scaled <= glyph_buffer.height
{
// Region 'E' for massive glyphs. These are rendered uncached and un-oversampled.
region_kind = .E
region = nil
if bounds_width_scaled <= glyph_buffer.width / 2 && bounds_height_scaled <= glyph_buffer.height / 2 {
over_sample = { 2.0, 2.0 }
}
else {
over_sample = { 1.0, 1.0 }
}
return
}
else {
region_kind = .None
return
// Use a lookup table for faster region selection
region_lookup := [4]struct { kind: AtlasRegionKind, region: ^AtlasRegion } {
{ .A, & atlas.region_a },
{ .B, & atlas.region_b },
{ .C, & atlas.region_c },
{ .D, & atlas.region_d },
}
over_sample = glyph_buffer.over_sample
assert(region != nil)
return
for region in region_lookup do if bounds_width_scaled <= region.region.width && bounds_height_scaled <= region.region.height {
return region.kind, region.region, glyph_buffer.over_sample
}
if bounds_width_scaled <= glyph_buffer.width \
&& bounds_height_scaled <= glyph_buffer.height {
over_sample = \
bounds_width_scaled <= glyph_buffer.width / 2 &&
bounds_height_scaled <= glyph_buffer.height / 2 ? \
{2.0, 2.0} \
: {1.0, 1.0}
return .E, nil, over_sample
}
return .None, nil, {}
}

Binary file not shown.

346
draw.odin
View File

@@ -56,37 +56,31 @@ blit_quad :: proc( draw_list : ^DrawList, p0 : Vec2 = {0, 0}, p1 : Vec2 = {1, 1}
// p0.x, p0.y, p1.x, p1.y, uv0.x, uv0.y, uv1.x, uv1.y);
v_offset := cast(u32) len(draw_list.vertices)
vertex := Vertex {
quadv : [4]Vertex = {
{
{p0.x, p0.y},
uv0.x, uv0.y
}
append_elem( & draw_list.vertices, vertex )
vertex = Vertex {
},
{
{p0.x, p1.y},
uv0.x, uv1.y
}
append_elem( & draw_list.vertices, vertex )
vertex = Vertex {
},
{
{p1.x, p0.y},
uv1.x, uv0.y
}
append_elem( & draw_list.vertices, vertex )
vertex = Vertex {
},
{
{p1.x, p1.y},
uv1.x, uv1.y
}
append_elem( & draw_list.vertices, vertex )
}
append( & draw_list.vertices, ..quadv[:] )
quad_indices : []u32 = {
0, 1, 2,
2, 1, 3
}
for index : i32 = 0; index < 6; index += 1 {
append( & draw_list.indices, v_offset + quad_indices[ index ] )
0 + v_offset, 1 + v_offset, 2 + v_offset,
2 + v_offset, 1 + v_offset, 3 + v_offset
}
append( & draw_list.indices, ..quad_indices[:] )
return
}
@@ -94,108 +88,72 @@ cache_glyph :: proc( ctx : ^Context, font : FontID, glyph_index : Glyph, entry :
{
// profile(#procedure)
if glyph_index == Glyph(0) {
// Note(Original Author): Glyph not in current hb_font
return false
}
// Retrieve the shape definition from the parser.
shape, error := parser_get_glyph_shape(&entry.parser_info, glyph_index)
assert(error == .None)
if len(shape) == 0 {
return false
}
if ctx.debug_print_verbose
{
log( "shape:")
for vertex in shape
{
if vertex.type == .Move {
logf("move_to %d %d", vertex.x, vertex.y )
}
else if vertex.type == .Line {
logf("line_to %d %d", vertex.x, vertex.y )
}
else if vertex.type == .Curve {
logf("curve_to %d %d through %d %d", vertex.x, vertex.y, vertex.contour_x0, vertex.contour_y0 )
}
else if vertex.type == .Cubic {
logf("cubic_to %d %d through %d %d and %d %d",
vertex.x, vertex.y,
vertex.contour_x0, vertex.contour_y0,
vertex.contour_x1, vertex.contour_y1 )
}
}
}
outside := Vec2{bounds_0.x - 21, bounds_0.y - 33}
/*
Note(Original Author):
We need a random point that is outside our shape. We simply pick something diagonally across from top-left bound corner.
Note that this outside point is scaled alongside the glyph in ve_fontcache_draw_filled_path, so we don't need to handle that here.
*/
outside := Vec2 {
bounds_0.x - 21,
bounds_0.y - 33,
}
// Note(Original Author): Figure out scaling so it fits within our box.
draw := DrawCall_Default
draw.pass = FrameBufferPass.Glyph
draw.start_index = u32(len(ctx.draw_list.indices))
// Note(Original Author);
// Draw the path using simplified version of https://medium.com/@evanwallace/easy-scalable-text-rendering-on-the-gpu-c3f4d782c5ac.
// Instead of involving fragment shader code we simply make use of modern GPU ability to crunch triangles and brute force curve definitions.
path := ctx.temp_path
clear( & path)
for edge in shape do switch edge.type
{
path := &ctx.temp_path
clear(path)
append_bezier_curve :: #force_inline proc(path: ^[dynamic]Vertex, p0, p1, p2: Vec2, quality: u32) {
step := 1.0 / f32(quality)
for index := u32(1); index <= quality; index += 1 {
alpha := f32(index) * step
append( path, Vertex { pos = eval_point_on_bezier3(p0, p1, p2, alpha) } )
}
}
append_bezier_curve_cubic :: #force_inline proc(path: ^[dynamic]Vertex, p0, p1, p2, p3: Vec2, quality: u32) {
step := 1.0 / f32(quality)
for index := u32(1); index <= quality; index += 1 {
alpha := f32(index) * step
append( path, Vertex { pos = eval_point_on_bezier4(p0, p1, p2, p3, alpha) } )
}
}
for edge in shape do #partial switch edge.type {
case .Move:
if len(path) > 0 {
draw_filled_path(&ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose)
clear(path)
}
clear( & path)
fallthrough
case .Line:
append( & path, Vec2{ f32(edge.x), f32(edge.y) })
append( path, Vertex { pos = Vec2 { f32(edge.x), f32(edge.y)} } )
case .Curve:
assert(len(path) > 0)
p0 := path[ len(path) - 1 ]
p0 := path[ len(path) - 1].pos
p1 := Vec2{ f32(edge.contour_x0), f32(edge.contour_y0) }
p2 := Vec2{ f32(edge.x), f32(edge.y) }
step := 1.0 / f32(ctx.curve_quality)
alpha := step
for index := i32(0); index < i32(ctx.curve_quality); index += 1 {
append( & path, eval_point_on_bezier3( p0, p1, p2, alpha ))
alpha += step
}
append_bezier_curve( path, p0, p1, p2, ctx.curve_quality )
case .Cubic:
assert( len(path) > 0)
p0 := path[ len(path) - 1]
p0 := path[ len(path) - 1].pos
p1 := Vec2{ f32(edge.contour_x0), f32(edge.contour_y0) }
p2 := Vec2{ f32(edge.contour_x1), f32(edge.contour_y1) }
p3 := Vec2{ f32(edge.x), f32(edge.y) }
step := 1.0 / f32(ctx.curve_quality)
alpha := step
for index := i32(0); index < i32(ctx.curve_quality); index += 1 {
append( & path, eval_point_on_bezier4( p0, p1, p2, p3, alpha ))
alpha += step
append_bezier_curve_cubic( path, p0, p1, p2, p3, ctx.curve_quality )
}
case .None:
assert(false, "Unknown edge type or invalid")
}
if len(path) > 0 {
draw_filled_path(&ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose)
}
// Note(Original Author): Append the draw call
draw.end_index = cast(u32) len(ctx.draw_list.indices)
draw.end_index = u32(len(ctx.draw_list.indices))
if draw.end_index > draw.start_index {
append(&ctx.draw_list.calls, draw)
}
@@ -301,10 +259,9 @@ cache_glyph_to_atlas :: proc( ctx : ^Context,
glyph_buffer.batch_x += i32(gwidth_scaled_px)
screenspace_x_form( & glyph_draw_translate, & glyph_draw_scale, glyph_buffer_size )
call : DrawCall
clear_target_region : DrawCall
{
// Queue up clear on target region on atlas
using call
using clear_target_region
pass = .Atlas
region = .Ignore
start_index = cast(u32) len(glyph_buffer.clear_draw_list.indices)
@@ -314,9 +271,12 @@ cache_glyph_to_atlas :: proc( ctx : ^Context,
{ 1.0, 1.0 }, { 1.0, 1.0 } )
end_index = cast(u32) len(glyph_buffer.clear_draw_list.indices)
append( & glyph_buffer.clear_draw_list.calls, call )
}
// Queue up a blit from glyph_update_FBO to the atlas
blit_to_atlas : DrawCall
{
using blit_to_atlas
pass = .Atlas
region = .None
start_index = cast(u32) len(glyph_buffer.draw_list.indices)
@@ -325,14 +285,17 @@ cache_glyph_to_atlas :: proc( ctx : ^Context,
src_position, src_position + src_size )
end_index = cast(u32) len(glyph_buffer.draw_list.indices)
append( & glyph_buffer.draw_list.calls, call )
}
append( & glyph_buffer.clear_draw_list.calls, clear_target_region )
append( & glyph_buffer.draw_list.calls, blit_to_atlas )
// Render glyph to glyph_update_FBO
cache_glyph( ctx, font, glyph_index, entry, vec2(bounds_0), vec2(bounds_1), glyph_draw_scale, glyph_draw_translate )
}
can_batch_glyph :: #force_inline proc( ctx : ^Context, font : FontID, entry : ^Entry, glyph_index : Glyph,
// If the glyph is found in the atlas, nothing occurs; otherwise, the glyph draw call is set up to cache it to the atlas
check_glyph_in_atlas :: #force_inline proc( ctx : ^Context, font : FontID, entry : ^Entry, glyph_index : Glyph,
lru_code : u64,
atlas_index : i32,
region_kind : AtlasRegionKind,
@@ -415,9 +378,11 @@ directly_draw_massive_glyph :: proc( ctx : ^Context,
textspace_x_form( & glyph_position, & glyph_size, glyph_buffer_size )
// Add the glyph drawcall.
call : DrawCall
calls : [2]DrawCall
draw_to_target := & calls[0]
{
using call
using draw_to_target
pass = .Target_Uncached
colour = ctx.colour
start_index = u32(len(ctx.draw_list.indices))
@@ -427,18 +392,20 @@ directly_draw_massive_glyph :: proc( ctx : ^Context,
glyph_position, glyph_position + glyph_size )
end_index = u32(len(ctx.draw_list.indices))
append( & ctx.draw_list.calls, call )
}
clear_glyph_update := & calls[1]
{
// Clear glyph_update_FBO.
call.pass = .Glyph
call.start_index = 0
call.end_index = 0
call.clear_before_draw = true
append( & ctx.draw_list.calls, call )
clear_glyph_update.pass = .Glyph
clear_glyph_update.start_index = 0
clear_glyph_update.end_index = 0
clear_glyph_update.clear_before_draw = true
}
append( & ctx.draw_list.calls, ..calls[:] )
}
draw_cached_glyph :: proc( ctx : ^Context,
draw_cached_glyph :: proc( ctx : ^Context, shaped : ^ShapedText,
entry : ^Entry,
glyph_index : Glyph,
lru_code : u64,
@@ -480,12 +447,30 @@ draw_cached_glyph :: proc( ctx : ^Context,
bounds_0_scaled := bounds_0 * entry.size_scale //- { 0.5, 0.5 }
bounds_0_scaled = ceil(bounds_0_scaled)
dst := position + bounds_0_scaled * scale
dst -= glyph_padding * scale
dst := position + (bounds_0_scaled - glyph_padding) * scale
dst_scale := glyph_scale * scale
textspace_x_form( & slot_position, & glyph_scale, atlas_size )
// Shape call setup
when false
{
call := DrawCall_Default
{
using call
pass = .Target
colour = ctx.colour
start_index = cast(u32) len(shaped.draw_list.indices)
blit_quad( & shaped.draw_list,
dst, dst + dst_scale,
slot_position, slot_position + glyph_scale )
end_index = cast(u32) len(shaped.draw_list.indices)
}
append( & shaped.draw_list.calls, call )
}
else
{
// Add the glyph drawcall
call := DrawCall_Default
{
@@ -500,6 +485,7 @@ draw_cached_glyph :: proc( ctx : ^Context,
end_index = cast(u32) len(ctx.draw_list.indices)
}
append( & ctx.draw_list.calls, call )
}
return true
}
@@ -509,7 +495,7 @@ draw_cached_glyph :: proc( ctx : ^Context,
// Note(Original Author):
// WARNING: doesn't actually append drawcall; caller is responsible for actually appending the drawcall.
// ve_fontcache_draw_filled_path
draw_filled_path :: proc( draw_list : ^DrawList, outside_point : Vec2, path : []Vec2,
draw_filled_path :: proc( draw_list : ^DrawList, outside_point : Vec2, path : []Vertex,
scale := Vec2 { 1, 1 },
translate := Vec2 { 0, 0 },
debug_print_verbose : b32 = false
@@ -519,19 +505,16 @@ draw_filled_path :: proc( draw_list : ^DrawList, outside_point : Vec2, path : []
{
log("outline_path:")
for point in path {
vec := point * scale + translate
vec := point.pos * scale + translate
logf(" %0.2f %0.2f", vec.x, vec.y )
}
}
v_offset := cast(u32) len(draw_list.vertices)
for point in path {
vertex := Vertex {
pos = point * scale + translate,
u = 0,
v = 0,
}
append( & draw_list.vertices, vertex )
point := point
point.pos = point.pos * scale + translate
append( & draw_list.vertices, point )
}
outside_vertex := cast(u32) len(draw_list.vertices)
@@ -546,9 +529,12 @@ draw_filled_path :: proc( draw_list : ^DrawList, outside_point : Vec2, path : []
for index : u32 = 1; index < cast(u32) len(path); index += 1 {
indices := & draw_list.indices
append( indices, outside_vertex )
append( indices, v_offset + index - 1 )
append( indices, v_offset + index )
to_add := [3]u32 {
outside_vertex,
v_offset + index - 1,
v_offset + index
}
append( indices, ..to_add[:] )
}
}
@@ -558,30 +544,56 @@ draw_text_batch :: proc( ctx : ^Context, entry : ^Entry, shaped : ^ShapedText,
snap_width, snap_height : f32 )
{
flush_glyph_buffer_to_atlas(ctx)
atlas := & ctx.atlas
atlas_size := Vec2{ f32(atlas.width), f32(atlas.height) }
glyph_padding := f32(atlas.glyph_padding)
for index := batch_start_idx; index < batch_end_idx; index += 1
{
glyph_index := shaped.glyphs[index]
if glyph_index == 0 do continue
if parser_is_glyph_empty( & entry.parser_info, glyph_index ) do continue
if glyph_index == 0 || parser_is_glyph_empty( & entry.parser_info, glyph_index) do continue
region_kind, region, over_sample := decide_codepoint_region( ctx, entry, glyph_index )
lru_code := font_glyph_lru_code( entry.id, glyph_index )
atlas_index := cast(i32) -1
if region_kind != .E do atlas_index = LRU_get( & region.state, lru_code )
atlas_index := region_kind != .E ? LRU_get( & region.state, lru_code ) : -1
bounds_0, bounds_1 := parser_get_glyph_box( & entry.parser_info, glyph_index )
vbounds_0 := vec2(bounds_0)
vbounds_1 := vec2(bounds_1)
bounds_size := Vec2 { vbounds_1.x - vbounds_0.x, vbounds_1.y - vbounds_0.y }
shaped_position := shaped.positions[index]
glyph_translate := position + shaped_position * scale
glyph_cached := draw_cached_glyph( ctx,
entry, glyph_index,
lru_code, atlas_index,
vec2(bounds_0), vec2(bounds_1),
region_kind, region, over_sample,
glyph_translate, scale)
assert( glyph_cached == true )
if region_kind == .E
{
directly_draw_massive_glyph(ctx, entry, glyph_index,
vbounds_0, vbounds_1,
bounds_size,
over_sample, glyph_translate, scale )
}
else if atlas_index != -1
{
slot_position, _ := atlas_bbox( atlas, region_kind, atlas_index )
glyph_scale := bounds_size * entry.size_scale + glyph_padding
bounds_0_scaled := ceil( vbounds_0 * entry.size_scale )
dst := glyph_translate + (bounds_0_scaled - glyph_padding) * scale
dst_scale := glyph_scale * scale
textspace_x_form( & slot_position, & glyph_scale, atlas_size )
call := DrawCall_Default
call.pass = .Target
call.colour = ctx.colour
call.start_index = u32(len(ctx.draw_list.indices))
blit_quad(&ctx.draw_list,
dst, dst + dst_scale,
slot_position, slot_position + glyph_scale )
call.end_index = u32(len(ctx.draw_list.indices))
append(&ctx.draw_list.calls, call)
}
}
}
@@ -594,7 +606,6 @@ draw_text_shape :: proc( ctx : ^Context,
snap_width, snap_height : f32
) -> (cursor_pos : Vec2)
{
// position := position //+ ctx.cursor_pos * scale
// profile(#procedure)
batch_start_idx : i32 = 0
for index : i32 = 0; index < cast(i32) len(shaped.glyphs); index += 1
@@ -607,9 +618,9 @@ draw_text_shape :: proc( ctx : ^Context,
atlas_index := cast(i32) -1
if region_kind != .E do atlas_index = LRU_get( & region.state, lru_code )
if can_batch_glyph( ctx, font, entry, glyph_index, lru_code, atlas_index, region_kind, region, over_sample ) do continue
if check_glyph_in_atlas( ctx, font, entry, glyph_index, lru_code, atlas_index, region_kind, region, over_sample ) do continue
// Glyph has not been cached, needs to be directly drawn.
// We can no longer directly append the shape as it has missing glyphs in the atlas
// First batch the other cached glyphs
// flush_glyph_buffer_to_atlas(ctx)
@@ -621,10 +632,10 @@ draw_text_shape :: proc( ctx : ^Context,
batch_start_idx = index
}
// flush_glyph_buffer_to_atlas(ctx)
draw_text_batch( ctx, entry, shaped, batch_start_idx, cast(i32) len(shaped.glyphs), position, scale, snap_width , snap_height )
reset_batch_codepoint_state( ctx )
cursor_pos = shaped.end_cursor_pos
cursor_pos = position + shaped.end_cursor_pos * scale
return
}
@@ -650,6 +661,34 @@ flush_glyph_buffer_to_atlas :: proc( ctx : ^Context )
}
}
// flush_glyph_buffer_to_atlas :: proc( ctx : ^Context )
// {
// // profile(#procedure)
// // Flush drawcalls to draw list
// if len(ctx.glyph_buffer.clear_draw_list.calls) > 0 {
// merge_draw_list( & ctx.draw_list, & ctx.glyph_buffer.clear_draw_list)
// clear_draw_list( & ctx.glyph_buffer.clear_draw_list)
// }
// if len(ctx.glyph_buffer.draw_list.calls) > 0 {
// merge_draw_list( & ctx.draw_list, & ctx.glyph_buffer.draw_list)
// clear_draw_list( & ctx.glyph_buffer.draw_list)
// }
// // Clear glyph_update_FBO
// if ctx.glyph_buffer.batch_x != 0
// {
// call := DrawCall {
// pass = .Glyph,
// start_index = 0,
// end_index = 0,
// clear_before_draw = true,
// }
// append( & ctx.draw_list.calls, call)
// ctx.glyph_buffer.batch_x = 0
// }
// }
// ve_fontcache_merge_drawlist
merge_draw_list :: proc( dst, src : ^DrawList )
{
@@ -677,39 +716,34 @@ merge_draw_list :: proc( dst, src : ^DrawList )
}
}
optimize_draw_list :: proc( draw_list : ^DrawList, call_offset : int )
{
optimize_draw_list :: proc(draw_list: ^DrawList, call_offset: int) {
// profile(#procedure)
assert(draw_list != nil)
write_index : int = call_offset
for index : int = 1 + call_offset; index < len(draw_list.calls); index += 1
{
assert( write_index <= index )
draw_0 := & draw_list.calls[ write_index ]
draw_1 := & draw_list.calls[ index ]
merge : b32 = true
if draw_0.pass != draw_1.pass do merge = false
if draw_0.end_index != draw_1.start_index do merge = false
if draw_0.region != draw_1.region do merge = false
if draw_1.clear_before_draw do merge = false
if draw_0.colour != draw_1.colour do merge = false
if merge
{
// logf("merging %v : %v %v", draw_0.pass, write_index, index )
draw_0.end_index = draw_1.end_index
draw_1.start_index = 0
draw_1.end_index = 0
can_merge_draw_calls :: #force_inline proc "contextless" ( a, b : ^DrawCall ) -> bool {
result := \
a.pass == b.pass &&
a.end_index == b.start_index &&
a.region == b.region &&
a.colour == b.colour &&
! b.clear_before_draw
return result
}
else
write_index := call_offset
for read_index := call_offset + 1; read_index < len(draw_list.calls); read_index += 1
{
// logf("can't merge %v : %v %v", draw_0.pass, write_index, index )
draw_current := & draw_list.calls[write_index]
draw_next := & draw_list.calls[read_index]
if can_merge_draw_calls(draw_current, draw_next) {
draw_current.end_index = draw_next.end_index
}
else {
// Move to the next write position and copy the draw call
write_index += 1
if write_index != index {
draw_2 := & draw_list.calls[ write_index ]
draw_2^ = draw_1^
if write_index != read_index {
draw_list.calls[write_index] = (draw_next^)
}
}
}

View File

@@ -23,10 +23,10 @@ import "core:mem"
Arena :: mem.Arena
arena_allocator :: mem.arena_allocator
arena_init :: mem.arena_init
// import "codebase:grime"
// log :: grime.log
// logf :: grime.logf
// profile :: grime.profile
import "codebase:grime"
log :: grime.log
logf :: grime.logf
profile :: grime.profile
//#region("Proc overload mappings")

257
misc.odin
View File

@@ -1,7 +1,10 @@
package VEFontCache
import "base:runtime"
import core_log "core:log"
import "core:simd"
import "core:math"
// import core_log "core:log"
Colour :: [4]f32
Vec2 :: [2]f32
@@ -17,23 +20,23 @@ vec2i_from_vec2 :: #force_inline proc "contextless" ( v2 : Vec2 ) -> Vec2
// This buffer is used below exclusively to prevent any allocator recursion when verbose logging from allocators.
// This means a single line is limited to 32k buffer (increase naturally if this SOMEHOW becomes a bottleneck...)
Logger_Allocator_Buffer : [32 * Kilobyte]u8
// Logger_Allocator_Buffer : [32 * Kilobyte]u8
log :: proc( msg : string, level := core_log.Level.Info, loc := #caller_location ) {
temp_arena : Arena; arena_init(& temp_arena, Logger_Allocator_Buffer[:])
context.allocator = arena_allocator(& temp_arena)
context.temp_allocator = arena_allocator(& temp_arena)
// log :: proc( msg : string, level := core_log.Level.Info, loc := #caller_location ) {
// temp_arena : Arena; arena_init(& temp_arena, Logger_Allocator_Buffer[:])
// context.allocator = arena_allocator(& temp_arena)
// context.temp_allocator = arena_allocator(& temp_arena)
core_log.log( level, msg, location = loc )
}
// core_log.log( level, msg, location = loc )
// }
logf :: proc( fmt : string, args : ..any, level := core_log.Level.Info, loc := #caller_location ) {
temp_arena : Arena; arena_init(& temp_arena, Logger_Allocator_Buffer[:])
context.allocator = arena_allocator(& temp_arena)
context.temp_allocator = arena_allocator(& temp_arena)
// logf :: proc( fmt : string, args : ..any, level := core_log.Level.Info, loc := #caller_location ) {
// temp_arena : Arena; arena_init(& temp_arena, Logger_Allocator_Buffer[:])
// context.allocator = arena_allocator(& temp_arena)
// context.temp_allocator = arena_allocator(& temp_arena)
core_log.logf( level, fmt, ..args, location = loc )
}
// core_log.logf( level, fmt, ..args, location = loc )
// }
reload_array :: proc( self : ^[dynamic]$Type, allocator : Allocator ) {
raw := transmute( ^runtime.Raw_Dynamic_Array) self
@@ -50,14 +53,77 @@ font_glyph_lru_code :: #force_inline proc "contextless" ( font : FontID, glyph_i
return
}
shape_lru_hash :: #force_inline proc "contextless" ( label : string ) -> u64 {
hash : u64
for str_byte in transmute([]byte) label {
hash = ((hash << 8) + hash) + u64(str_byte)
}
return hash
is_empty :: #force_inline proc ( ctx : ^Context, entry : ^Entry, glyph_index : Glyph ) -> b32
{
	// Glyph index 0 is the font's undefined/.notdef slot; treat it as empty.
	if glyph_index == 0 {
		return true
	}
	// Otherwise defer to the parser: a glyph with no outline data is empty.
	if parser_is_glyph_empty( & entry.parser_info, glyph_index ) {
		return true
	}
	return false
}
mark_batch_codepoint_seen :: #force_inline proc ( ctx : ^Context, lru_code : u64 ) {
	// Record this lru code in the per-batch seen set and bump the running tally.
	ctx.temp_codepoint_seen_num += 1
	ctx.temp_codepoint_seen[lru_code] = true
}
reset_batch_codepoint_state :: #force_inline proc( ctx : ^Context ) {
	// Wipe the per-batch seen set and its counter so the next batch starts fresh.
	ctx.temp_codepoint_seen_num = 0
	clear_map( & ctx.temp_codepoint_seen )
}
screenspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 )
{
	// Transforms `position` from pixel space over a `size`-sized target into
	// normalized clip space [-1, 1], and rescales `scale` by the same factor.
	// The math is done in f64 (then narrowed back to f32) to reduce precision
	// loss when coordinates are large relative to the quotient.
	// NOTE(review): the original guarded an f32 fallback behind `if true`,
	// leaving a permanently unreachable else branch; it has been removed.
	pos_64   := vec2_64_from_vec2(position^)
	scale_64 := vec2_64_from_vec2(scale^)

	quotient : Vec2_64 = 1.0 / vec2_64(size)
	pos_64   = pos_64   * quotient * 2.0 - 1.0
	scale_64 = scale_64 * quotient * 2.0

	(position^) = { f32(pos_64.x), f32(pos_64.y) }
	(scale^)    = { f32(scale_64.x), f32(scale_64.y) }
}
textspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 )
{
	// Transforms `position` from pixel space over a `size`-sized texture into
	// normalized texture space [0, 1], and rescales `scale` by the same factor.
	// Done in f64 (then narrowed back to f32) for precision, matching
	// screenspace_x_form.
	// NOTE(review): the original guarded an f32 fallback behind `if true`,
	// leaving a permanently unreachable else branch; it has been removed.
	pos_64   := vec2_64_from_vec2(position^)
	scale_64 := vec2_64_from_vec2(scale^)

	quotient : Vec2_64 = 1.0 / vec2_64(size)
	pos_64   *= quotient
	scale_64 *= quotient

	(position^) = { f32(pos_64.x), f32(pos_64.y) }
	(scale^)    = { f32(scale_64.x), f32(scale_64.y) }
}
Use_SIMD_For_Bezier_Ops :: true
when ! Use_SIMD_For_Bezier_Ops
{
// For a provided alpha value,
// allows the function to calculate the position of a point along the curve at any given fraction of its total length
// ve_fontcache_eval_bezier (quadratic)
@@ -104,62 +170,119 @@ eval_point_on_bezier4 :: #force_inline proc "contextless" ( p0, p1, p2, p3 : Vec
point := start_point + control_a + control_b + end_point
return { f32(point.x), f32(point.y) }
}
is_empty :: #force_inline proc ( ctx : ^Context, entry : ^Entry, glyph_index : Glyph ) -> b32
{
if glyph_index == 0 do return true
if parser_is_glyph_empty( & entry.parser_info, glyph_index ) do return true
return false
}
mark_batch_codepoint_seen :: #force_inline proc ( ctx : ^Context, lru_code : u64 ) {
ctx.temp_codepoint_seen[lru_code] = true
ctx.temp_codepoint_seen_num += 1
}
reset_batch_codepoint_state :: #force_inline proc( ctx : ^Context ) {
clear_map( & ctx.temp_codepoint_seen )
ctx.temp_codepoint_seen_num = 0
}
screenspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 ) {
when true
{
pos_64 := vec2_64_from_vec2(position^)
scale_64 := vec2_64_from_vec2(scale^)
quotient : Vec2_64 = 1.0 / vec2_64(size)
pos_64 = pos_64 * quotient * 2.0 - 1.0
scale_64 = scale_64 * quotient * 2.0
(position^) = { f32(pos_64.x), f32(pos_64.y) }
(scale^) = { f32(scale_64.x), f32(scale_64.y) }
}
else
{
quotient : Vec2 = 1.0 / size
(position^) *= quotient * 2.0 - 1.0
(scale^) *= quotient * 2.0
}
// 4-lane f32 SIMD register used to carry 2D vectors (only the first two lanes hold data).
Vec2_SIMD :: simd.f32x4

// Widens a Vec2 into a SIMD register; the two unused upper lanes are zeroed.
vec2_to_simd :: #force_inline proc "contextless" (v: Vec2) -> Vec2_SIMD {
	return Vec2_SIMD{v.x, v.y, 0, 0}
}
textspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 ) {
when true
// Narrows a SIMD register back to a Vec2 by extracting the first two lanes.
simd_to_vec2 :: #force_inline proc "contextless" (v: Vec2_SIMD) -> Vec2 {
	x := simd.extract(v, 0)
	y := simd.extract(v, 1)
	return Vec2{ x, y }
}
// Component-wise Vec2 addition routed through the SIMD helpers.
vec2_add_simd :: #force_inline proc "contextless" (a, b: Vec2) -> Vec2 {
	return simd_to_vec2( simd.add( vec2_to_simd(a), vec2_to_simd(b) ) )
}
// Component-wise Vec2 subtraction (a - b) routed through the SIMD helpers.
vec2_sub_simd :: #force_inline proc "contextless" (a, b: Vec2) -> Vec2 {
	return simd_to_vec2( simd.sub( vec2_to_simd(a), vec2_to_simd(b) ) )
}
// Scales a Vec2 uniformly by `s` using SIMD multiply with a splatted scalar.
vec2_mul_simd :: #force_inline proc "contextless" (a: Vec2, s: f32) -> Vec2 {
	splat := Vec2_SIMD{s, s, s, s}
	return simd_to_vec2( simd.mul( vec2_to_simd(a), splat ) )
}
// Divides a Vec2 uniformly by `s` using SIMD divide with a splatted scalar.
// NOTE(review): no zero check — s == 0 follows IEEE divide semantics, as before.
vec2_div_simd :: #force_inline proc "contextless" (a: Vec2, s: f32) -> Vec2 {
	splat := Vec2_SIMD{s, s, s, s}
	return simd_to_vec2( simd.div( vec2_to_simd(a), splat ) )
}
// Dot product of two Vec2s: lane-wise multiply, then an ordered horizontal add.
// The two zeroed upper lanes contribute nothing to the sum.
vec2_dot_simd :: #force_inline proc "contextless" (a, b: Vec2) -> f32 {
	products := simd.mul( vec2_to_simd(a), vec2_to_simd(b) )
	return simd.reduce_add_ordered(products)
}
// Squared Euclidean length: dot product of the vector with itself.
vec2_length_sqr_simd :: #force_inline proc "contextless" (a: Vec2) -> f32 {
	return vec2_dot_simd(a, a)
}
// Euclidean length: square root of the squared length.
vec2_length_simd :: #force_inline proc "contextless" (a: Vec2) -> f32 {
	return math.sqrt(vec2_length_sqr_simd(a))
}
// Returns `a` scaled to unit length. A vector whose length is not strictly
// positive (zero, or NaN from non-finite input) is returned unchanged,
// avoiding a divide by zero.
vec2_normalize_simd :: #force_inline proc "contextless" (a: Vec2) -> Vec2 {
	magnitude := vec2_length_simd(a)
	if ! (magnitude > 0) {
		return a
	}
	return vec2_mul_simd(a, 1.0 / magnitude)
}
// SIMD-optimized version of eval_point_on_bezier3
eval_point_on_bezier3 :: #force_inline proc "contextless" (p0, p1, p2: Vec2, alpha: f32) -> Vec2
{
pos_64 := vec2_64_from_vec2(position^)
scale_64 := vec2_64_from_vec2(scale^)
simd_p0 := vec2_to_simd(p0)
simd_p1 := vec2_to_simd(p1)
simd_p2 := vec2_to_simd(p2)
quotient : Vec2_64 = 1.0 / vec2_64(size)
pos_64 *= quotient
scale_64 *= quotient
one_minus_alpha := 1.0 - alpha
weight_start := one_minus_alpha * one_minus_alpha
weight_control := 2.0 * one_minus_alpha * alpha
weight_end := alpha * alpha
(position^) = { f32(pos_64.x), f32(pos_64.y) }
(scale^) = { f32(scale_64.x), f32(scale_64.y) }
simd_weights := Vec2_SIMD{weight_start, weight_control, weight_end, 0}
result := simd.add(
simd.add(
simd.mul( simd_p0, simd.swizzle( simd_weights, 0, 0, 0, 0) ),
simd.mul( simd_p1, simd.swizzle( simd_weights, 1, 1, 1, 1) )
),
simd.mul( simd_p2, simd.swizzle(simd_weights, 2, 2, 2, 2) )
)
return simd_to_vec2(result)
}
else
eval_point_on_bezier4 :: #force_inline proc "contextless" (p0, p1, p2, p3: Vec2, alpha: f32) -> Vec2
{
quotient : Vec2 = 1.0 / size
(position^) *= quotient
(scale^) *= quotient
simd_p0 := vec2_to_simd(p0)
simd_p1 := vec2_to_simd(p1)
simd_p2 := vec2_to_simd(p2)
simd_p3 := vec2_to_simd(p3)
one_minus_alpha := 1.0 - alpha
weight_start := one_minus_alpha * one_minus_alpha * one_minus_alpha
weight_c_a := 3 * one_minus_alpha * one_minus_alpha * alpha
weight_c_b := 3 * one_minus_alpha * alpha * alpha
weight_end := alpha * alpha * alpha
simd_weights := Vec2_SIMD { weight_start, weight_c_a, weight_c_b, weight_end }
result := simd.add(
simd.add(
simd.mul( simd_p0, simd.swizzle(simd_weights, 0, 0, 0, 0) ),
simd.mul( simd_p1, simd.swizzle(simd_weights, 1, 1, 1, 1) )
),
simd.add(
simd.mul( simd_p2, simd.swizzle(simd_weights, 2, 2, 2, 2) ),
simd.mul( simd_p3, simd.swizzle(simd_weights, 3, 3, 3, 3) )
)
)
return simd_to_vec2(result)
}
}

View File

@@ -1,11 +1,10 @@
package VEFontCache
import "core:math"
ShapedText :: struct {
glyphs : [dynamic]Glyph,
positions : [dynamic]Vec2,
end_cursor_pos : Vec2,
size : Vec2,
}
ShapedTextCache :: struct {
@@ -14,36 +13,33 @@ ShapedTextCache :: struct {
next_cache_id : i32,
}
// Streaming hash for shape-cache keys: for each byte, h = (h << 8) + h + byte
// (equivalently h * 257 + byte). The accumulator is passed by pointer so
// multiple byte slices (e.g. font id then text) can be folded into one code.
shape_lru_hash :: #force_inline proc "contextless" ( hash : ^u64, bytes : []byte ) {
	h := hash^
	for b in bytes {
		h = (h << 8) + h + u64(b)
	}
	hash^ = h
}
shape_text_cached :: proc( ctx : ^Context, font : FontID, text_utf8 : string, entry : ^Entry ) -> ^ShapedText
{
// profile(#procedure)
@static buffer : [64 * Kilobyte]byte
font := font
text_size := len(text_utf8)
sice_end_offset := size_of(FontID) + len(text_utf8)
buffer_slice := buffer[:]
font_bytes := slice_ptr( transmute(^byte) & font, size_of(FontID) )
copy( buffer_slice, font_bytes )
text_bytes := transmute( []byte) text_utf8
buffer_slice_post_font := buffer[ size_of(FontID) : sice_end_offset ]
copy( buffer_slice_post_font, text_bytes )
hash := shape_lru_hash( transmute(string) buffer[: sice_end_offset ] )
lru_code : u64
shape_lru_hash( & lru_code, font_bytes )
shape_lru_hash( & lru_code, text_bytes )
shape_cache := & ctx.shape_cache
state := & ctx.shape_cache.state
shape_cache_idx := LRU_get( state, hash )
shape_cache_idx := LRU_get( state, lru_code )
if shape_cache_idx == -1
{
if shape_cache.next_cache_id < i32(state.capacity) {
shape_cache_idx = shape_cache.next_cache_id
shape_cache.next_cache_id += 1
evicted := LRU_put( state, hash, shape_cache_idx )
assert( evicted == hash )
evicted := LRU_put( state, lru_code, shape_cache_idx )
}
else
{
@@ -53,16 +49,16 @@ shape_text_cached :: proc( ctx : ^Context, font : FontID, text_utf8 : string, en
shape_cache_idx = LRU_peek( state, next_evict_idx, must_find = true )
assert( shape_cache_idx != - 1 )
LRU_put( state, hash, shape_cache_idx )
LRU_put( state, lru_code, shape_cache_idx )
}
shape_text_uncached( ctx, font, text_utf8, entry, & shape_cache.storage[ shape_cache_idx ] )
shape_entry := & shape_cache.storage[ shape_cache_idx ]
shape_text_uncached( ctx, font, text_utf8, entry, shape_entry )
}
return & shape_cache.storage[ shape_cache_idx ]
}
// TODO(Ed): Make position rounding an option
shape_text_uncached :: proc( ctx : ^Context, font : FontID, text_utf8 : string, entry : ^Entry, output : ^ShapedText )
{
// profile(#procedure)
@@ -74,12 +70,17 @@ shape_text_uncached :: proc( ctx : ^Context, font : FontID, text_utf8 : string,
clear( & output.glyphs )
clear( & output.positions )
ascent, descent, line_gap := parser_get_font_vertical_metrics( & entry.parser_info )
ascent_i32, descent_i32, line_gap_i32 := parser_get_font_vertical_metrics( & entry.parser_info )
ascent := f32(ascent_i32)
descent := f32(descent_i32)
line_gap := f32(line_gap_i32)
line_height := (ascent - descent + line_gap) * entry.size_scale
if use_full_text_shape
{
// assert( entry.shaper_info != nil )
shaper_shape_from_text( & ctx.shaper_ctx, & entry.shaper_info, output, text_utf8, ascent, descent, line_gap, entry.size, entry.size_scale )
shaper_shape_from_text( & ctx.shaper_ctx, & entry.shaper_info, output, text_utf8, ascent_i32, descent_i32, line_gap_i32, entry.size, entry.size_scale )
// TODO(Ed): Need to be able to provide the text height as well
return
}
else
@@ -87,13 +88,10 @@ shape_text_uncached :: proc( ctx : ^Context, font : FontID, text_utf8 : string,
// Note(Original Author):
// We use our own fallback dumbass text shaping.
// WARNING: PLEASE USE HARFBUZZ. GOOD TEXT SHAPING IS IMPORTANT FOR INTERNATIONALISATION.
ascent := f32(ascent)
descent := f32(descent)
line_gap := f32(line_gap)
line_count : int = 1
max_line_width : f32 = 0
position : Vec2
advance : i32 = 0
to_left_side_glyph : i32 = 0
prev_codepoint : rune
for codepoint in text_utf8
@@ -104,29 +102,34 @@ shape_text_uncached :: proc( ctx : ^Context, font : FontID, text_utf8 : string,
}
if codepoint == '\n'
{
line_count += 1
max_line_width = max(max_line_width, position.x)
position.x = 0.0
position.y -= (ascent - descent + line_gap) * entry.size_scale
position.y -= line_height
position.y = ceil(position.y)
prev_codepoint = rune(0)
continue
}
if abs( entry.size ) <= Advance_Snap_Smallfont_Size {
position.x = math.ceil( position.x )
position.x = ceil( position.x )
}
append( & output.glyphs, parser_find_glyph_index( & entry.parser_info, codepoint ))
advance, to_left_side_glyph = parser_get_codepoint_horizontal_metrics( & entry.parser_info, codepoint )
advance, _ := parser_get_codepoint_horizontal_metrics( & entry.parser_info, codepoint )
append( & output.positions, Vec2 {
ceil(position.x),
position.y
})
// append( & output.positions, position )
position.x += f32(advance) * entry.size_scale
prev_codepoint = codepoint
}
output.end_cursor_pos = position
max_line_width = max(max_line_width, position.x)
output.size.x = max_line_width
output.size.y = f32(line_count) * line_height
}
}