Added manual SIMD, but Odin's already doing it perfectly (+ other attempted optimizations)

2024-06-28 07:31:51 -04:00
parent b8665d0bc2
commit a28303bad6
4 changed files with 354 additions and 199 deletions
--- a/code/font/VEFontCache/atlas.odin
+++ b/code/font/VEFontCache/atlas.odin
@@ -86,68 +86,110 @@ atlas_bbox :: proc( atlas : ^Atlas, region : AtlasRegionKind, local_idx : i32 )
 	return
 }

-decide_codepoint_region :: proc( ctx : ^Context, entry : ^Entry, glyph_index : Glyph
+// decide_codepoint_region :: proc( ctx : ^Context, entry : ^Entry, glyph_index : Glyph
+// ) -> (region_kind : AtlasRegionKind, region : ^AtlasRegion, over_sample : Vec2)
+// {
+// 	if parser_is_glyph_empty( & entry.parser_info, glyph_index ) {
+// 		region_kind = .None
+// 	}
+
+// 	bounds_0, bounds_1 := parser_get_glyph_box( & entry.parser_info, glyph_index )
+// 	bounds_width  := f32(bounds_1.x - bounds_0.x)
+// 	bounds_height := f32(bounds_1.y - bounds_0.y)
+
+// 	atlas        := & ctx.atlas
+// 	glyph_buffer := & ctx.glyph_buffer
+
+// 	glyph_padding := f32(atlas.glyph_padding) * 2
+
+// 	bounds_width_scaled  := cast(u32) (bounds_width  * entry.size_scale + glyph_padding)
+// 	bounds_height_scaled := cast(u32) (bounds_height * entry.size_scale + glyph_padding)
+
+// 	if bounds_width_scaled <= atlas.region_a.width && bounds_height_scaled <= atlas.region_a.height
+// 	{
+// 		// Region A for small glyphs. These are good for things such as punctuation.
+// 		region_kind = .A
+// 		region      = & atlas.region_a
+// 	}
+// 	else if bounds_width_scaled <= atlas.region_b.width && bounds_height_scaled <= atlas.region_b.height
+// 	{
+// 		// Region B for tall glyphs. These are good for things such as european alphabets.
+// 		region_kind = .B
+// 		region      = & atlas.region_b
+// 	}
+// 	else if bounds_width_scaled <= atlas.region_c.width && bounds_height_scaled <= atlas.region_c.height
+// 	{
+// 		// Region C for big glyphs. These are good for things such as asian typography.
+// 		region_kind = .C
+// 		region      = & atlas.region_c
+// 	}
+// 	else if bounds_width_scaled <= atlas.region_d.width && bounds_height_scaled <= atlas.region_d.height
+// 	{
+// 		// Region D for huge glyphs. These are good for things such as titles and 4k.
+// 		region_kind = .D
+// 		region      = & atlas.region_d
+// 	}
+// 	else if bounds_width_scaled <= glyph_buffer.width && bounds_height_scaled <= glyph_buffer.height
+// 	{
+// 		// Region 'E' for massive glyphs. These are rendered uncached and un-oversampled.
+// 		region_kind = .E
+// 		region      = nil
+// 		if bounds_width_scaled <= glyph_buffer.width / 2 && bounds_height_scaled <= glyph_buffer.height / 2 {
+// 			over_sample = { 2.0, 2.0 }
+// 		}
+// 		else {
+// 			over_sample = { 1.0, 1.0 }
+// 		}
+// 		return
+// 	}
+// 	else {
+// 		region_kind = .None
+// 		return
+// 	}
+
+// 	over_sample = glyph_buffer.over_sample
+// 	assert(region != nil)
+// 	return
+// }
+
+decide_codepoint_region :: proc(ctx : ^Context, entry : ^Entry, glyph_index : Glyph
 ) -> (region_kind : AtlasRegionKind, region : ^AtlasRegion, over_sample : Vec2)
 {
-	if parser_is_glyph_empty( & entry.parser_info, glyph_index ) {
-		region_kind = .None
+	if parser_is_glyph_empty(&entry.parser_info, glyph_index) {
+		return .None, nil, {}
 	}

-	bounds_0, bounds_1 := parser_get_glyph_box( & entry.parser_info, glyph_index )
-	bounds_width  := f32(bounds_1.x - bounds_0.x)
-	bounds_height := f32(bounds_1.y - bounds_0.y)
+	bounds_0, bounds_1 := parser_get_glyph_box(&entry.parser_info, glyph_index)
+	bounds_width       := f32(bounds_1.x - bounds_0.x)
+	bounds_height      := f32(bounds_1.y - bounds_0.y)

-	atlas        := & ctx.atlas
-	glyph_buffer := & ctx.glyph_buffer
+	atlas         := & ctx.atlas
+	glyph_buffer  := & ctx.glyph_buffer
+	glyph_padding := f32( atlas.glyph_padding ) * 2

-	glyph_padding := f32(atlas.glyph_padding) * 2
+	bounds_width_scaled  := u32(bounds_width  * entry.size_scale + glyph_padding)
+	bounds_height_scaled := u32(bounds_height * entry.size_scale + glyph_padding)

-	bounds_width_scaled  := cast(u32) (bounds_width  * entry.size_scale + glyph_padding)
-	bounds_height_scaled := cast(u32) (bounds_height * entry.size_scale + glyph_padding)
-
-	if bounds_width_scaled <= atlas.region_a.width && bounds_height_scaled <= atlas.region_a.height
-	{
-		// Region A for small glyphs. These are good for things such as punctuation.
-		region_kind = .A
-		region      = & atlas.region_a
-	}
-	else if bounds_width_scaled <= atlas.region_b.width && bounds_height_scaled <= atlas.region_b.height
-	{
-		// Region B for tall glyphs. These are good for things such as european alphabets.
-		region_kind = .B
-		region      = & atlas.region_b
-	}
-	else if bounds_width_scaled <= atlas.region_c.width && bounds_height_scaled <= atlas.region_c.height
-	{
-		// Region C for big glyphs. These are good for things such as asian typography.
-		region_kind = .C
-		region      = & atlas.region_c
-	}
-	else if bounds_width_scaled <= atlas.region_d.width && bounds_height_scaled <= atlas.region_d.height
-	{
-		// Region D for huge glyphs. These are good for things such as titles and 4k.
-		region_kind = .D
-		region      = & atlas.region_d
-	}
-	else if bounds_width_scaled <= glyph_buffer.width && bounds_height_scaled <= glyph_buffer.height
-	{
-		// Region 'E' for massive glyphs. These are rendered uncached and un-oversampled.
-		region_kind = .E
-		region      = nil
-		if bounds_width_scaled <= glyph_buffer.width / 2 && bounds_height_scaled <= glyph_buffer.height / 2 {
-			over_sample = { 2.0, 2.0 }
-		}
-		else {
-			over_sample = { 1.0, 1.0 }
-		}
-		return
-	}
-	else {
-		region_kind = .None
-		return
+	// Use a lookup table for faster region selection
+	region_lookup := [4]struct { kind: AtlasRegionKind, region: ^AtlasRegion } {
+			{ .A, & atlas.region_a },
+			{ .B, & atlas.region_b },
+			{ .C, & atlas.region_c },
+			{ .D, & atlas.region_d },
 	}

-	over_sample = glyph_buffer.over_sample
-	assert(region != nil)
-	return
+	for region in region_lookup do if bounds_width_scaled <= region.region.width && bounds_height_scaled <= region.region.height {
+		return region.kind, region.region, glyph_buffer.over_sample
+	}
+
+	if bounds_width_scaled  <= glyph_buffer.width \
+	&& bounds_height_scaled <= glyph_buffer.height {
+		over_sample = \
+			bounds_width_scaled  <= glyph_buffer.width  / 2 &&
+			bounds_height_scaled <= glyph_buffer.height / 2 ? \
+			  {2.0, 2.0} \
+			: {1.0, 1.0}
+		return .E, nil, over_sample
+	}
+	return .None, nil, {}
 }
--- a/code/font/VEFontCache/draw.odin
+++ b/code/font/VEFontCache/draw.odin
@@ -56,23 +56,23 @@ blit_quad :: proc( draw_list : ^DrawList, p0 : Vec2 = {0, 0}, p1 : Vec2 = {1, 1}
 		// p0.x, p0.y, p1.x, p1.y, uv0.x, uv0.y, uv1.x, uv1.y);
 	v_offset := cast(u32) len(draw_list.vertices)

-	quadv : [4]Vertex
-
-	quadv[0] = Vertex {
-		{p0.x, p0.y},
-		uv0.x, uv0.y
-	}
-	quadv[1] = Vertex {
-		{p0.x, p1.y},
-		uv0.x, uv1.y
-	}
-	quadv[2] = Vertex {
-		{p1.x, p0.y},
-		uv1.x, uv0.y
-	}
-	quadv[3] = Vertex {
-		{p1.x, p1.y},
-		uv1.x, uv1.y
+	quadv : [4]Vertex = {
+		{
+			{p0.x, p0.y},
+			uv0.x, uv0.y
+		},
+		{
+			{p0.x, p1.y},
+			uv0.x, uv1.y
+		},
+		{
+			{p1.x, p0.y},
+			uv1.x, uv0.y
+		},
+		{
+			{p1.x, p1.y},
+			uv1.x, uv1.y
+		}
 	}
 	append( & draw_list.vertices, ..quadv[:] )

@@ -84,118 +84,81 @@ blit_quad :: proc( draw_list : ^DrawList, p0 : Vec2 = {0, 0}, p1 : Vec2 = {1, 1}
 	return
 }

-cache_glyph :: proc( ctx : ^Context, font : FontID, glyph_index : Glyph, entry : ^Entry, bounds_0, bounds_1 : Vec2, scale, translate : Vec2  ) -> b32
+cache_glyph :: proc(ctx : ^Context, font : FontID, glyph_index : Glyph, entry : ^Entry, bounds_0, bounds_1 : Vec2, scale, translate : Vec2) -> b32
 {
 	// profile(#procedure)
 	if glyph_index == Glyph(0) {
-		// Note(Original Author): Glyph not in current hb_font
 		return false
 	}

-	// Retrieve the shape definition from the parser.
-	shape, error := parser_get_glyph_shape( & entry.parser_info, glyph_index )
-	assert( error == .None )
+	shape, error := parser_get_glyph_shape(&entry.parser_info, glyph_index)
+	assert(error == .None)
 	if len(shape) == 0 {
 		return false
 	}

-	if ctx.debug_print_verbose
-	{
-		log( "shape:")
-		for vertex in shape
-		{
-			if vertex.type == .Move {
-				logf("move_to %d %d", vertex.x, vertex.y )
-			}
-			else if vertex.type == .Line {
-				logf("line_to %d %d", vertex.x, vertex.y )
-			}
-			else if vertex.type == .Curve {
-				logf("curve_to %d %d through %d %d", vertex.x, vertex.y, vertex.contour_x0, vertex.contour_y0 )
-			}
-			else if vertex.type == .Cubic {
-				logf("cubic_to %d %d through %d %d and %d %d",
-					vertex.x, vertex.y,
-					vertex.contour_x0, vertex.contour_y0,
-					vertex.contour_x1, vertex.contour_y1 )
-			}
-		}
-	}
+	outside := Vec2{bounds_0.x - 21, bounds_0.y - 33}

-	/*
-	Note(Original Author):
-	We need a random point that is outside our shape. We simply pick something diagonally across from top-left bound corner.
-	Note that this outside point is scaled alongside the glyph in ve_fontcache_draw_filled_path, so we don't need to handle that here.
-	*/
-	outside := Vec2 {
-		bounds_0.x - 21,
-		bounds_0.y - 33,
-	}
-
-	// Note(Original Author): Figure out scaling so it fits within our box.
-	draw := DrawCall_Default
+	draw            := DrawCall_Default
 	draw.pass        = FrameBufferPass.Glyph
 	draw.start_index = u32(len(ctx.draw_list.indices))

-	// Note(Original Author);
-	// Draw the path using simplified version of https://medium.com/@evanwallace/easy-scalable-text-rendering-on-the-gpu-c3f4d782c5ac.
-	// Instead of involving fragment shader code we simply make use of modern GPU ability to crunch triangles and brute force curve definitions.
-	path := & ctx.temp_path
-	clear( path)
-	for edge in shape	do switch edge.type
-	{
+	path := &ctx.temp_path
+	clear(path)
+
+	append_bezier_curve :: #force_inline proc(path: ^[dynamic]Vertex, p0, p1, p2: Vec2, quality: u32) {
+		step := 1.0 / f32(quality)
+		for index := u32(1); index <= quality; index += 1 {
+			alpha := f32(index) * step
+			append( path, Vertex { pos = eval_point_on_bezier3(p0, p1, p2, alpha) } )
+		}
+	}
+
+	append_bezier_curve_cubic :: #force_inline proc(path: ^[dynamic]Vertex, p0, p1, p2, p3: Vec2, quality: u32) {
+		step := 1.0 / f32(quality)
+		for index := u32(1); index <= quality; index += 1 {
+			alpha := f32(index) * step
+			append( path, Vertex { pos = eval_point_on_bezier4(p0, p1, p2, p3, alpha) } )
+		}
+	}
+
+	for edge in shape do #partial switch edge.type {
 		case .Move:
 			if len(path) > 0 {
-				draw_filled_path( & ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose )
+					draw_filled_path(&ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose)
+					clear(path)
 			}
-			clear( path)
 			fallthrough

 		case .Line:
-			vertex := Vertex { pos = Vec2{ f32(edge.x), f32(edge.y) } }
-			append( path, vertex)
+			append( path, Vertex { pos = Vec2 { f32(edge.x), f32(edge.y)} } )

 		case .Curve:
-			assert( len(path) > 0 )
-			p0 := path[ len(path) - 1 ].pos
+			assert(len(path) > 0)
+			p0 := path[ len(path) - 1].pos
 			p1 := Vec2{ f32(edge.contour_x0), f32(edge.contour_y0) }
 			p2 := Vec2{ f32(edge.x), f32(edge.y) }
-
-			step  := 1.0 / f32(ctx.curve_quality)
-			alpha := step
-			for index := i32(0); index < i32(ctx.curve_quality); index += 1 {
-				append( path, Vertex { pos = eval_point_on_bezier3( p0, p1, p2, alpha ) })
-				alpha += step
-			}
+			append_bezier_curve( path, p0, p1, p2, ctx.curve_quality )

 		case .Cubic:
-			assert( len(path) > 0 )
+			assert( len(path) > 0)
 			p0 := path[ len(path) - 1].pos
 			p1 := Vec2{ f32(edge.contour_x0), f32(edge.contour_y0) }
 			p2 := Vec2{ f32(edge.contour_x1), f32(edge.contour_y1) }
 			p3 := Vec2{ f32(edge.x), f32(edge.y) }
-
-			step  := 1.0 / f32(ctx.curve_quality)
-			alpha := step
-			for index := i32(0); index < i32(ctx.curve_quality); index += 1 {
-				append( path, Vertex { pos = eval_point_on_bezier4( p0, p1, p2, p3, alpha ) })
-				alpha += step
-			}
-
-		case .None:
-			assert(false, "Unknown edge type or invalid")
+			append_bezier_curve_cubic( path, p0, p1, p2, p3, ctx.curve_quality )
 	}
+
 	if len(path) > 0 {
-		draw_filled_path( & ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose )
+		draw_filled_path(&ctx.draw_list, outside, path[:], scale, translate, ctx.debug_print_verbose)
 	}

-	// Note(Original Author): Apend the draw call
-	draw.end_index = cast(u32) len(ctx.draw_list.indices)
+	draw.end_index = u32(len(ctx.draw_list.indices))
 	if draw.end_index > draw.start_index {
-		append( & ctx.draw_list.calls, draw)
+		append(&ctx.draw_list.calls, draw)
 	}

-	parser_free_shape( & entry.parser_info, shape )
+	parser_free_shape(&entry.parser_info, shape)
 	return true
 }

@@ -698,6 +661,34 @@ flush_glyph_buffer_to_atlas :: proc( ctx : ^Context )
 	}
 }

+// flush_glyph_buffer_to_atlas :: proc( ctx : ^Context )
+// {
+// 	// profile(#procedure)
+// 	// Flush drawcalls to draw list
+// 	if len(ctx.glyph_buffer.clear_draw_list.calls) > 0 {
+// 		merge_draw_list( & ctx.draw_list, & ctx.glyph_buffer.clear_draw_list)
+// 		clear_draw_list( & ctx.glyph_buffer.clear_draw_list)
+// 	}
+
+// 	if len(ctx.glyph_buffer.draw_list.calls) > 0 {
+// 		merge_draw_list( & ctx.draw_list, & ctx.glyph_buffer.draw_list)
+// 		clear_draw_list( & ctx.glyph_buffer.draw_list)
+// 	}
+
+// 	// Clear glyph_update_FBO
+// 	if ctx.glyph_buffer.batch_x != 0
+// 	{
+// 			call := DrawCall {
+// 				pass              = .Glyph,
+// 				start_index       = 0,
+// 				end_index         = 0,
+// 				clear_before_draw = true,
+// 			}
+// 			append( & ctx.draw_list.calls, call)
+// 			ctx.glyph_buffer.batch_x = 0
+// 	}
+// }
+
 // ve_fontcache_merge_drawlist
 merge_draw_list :: proc( dst, src : ^DrawList )
 {
--- a/code/font/VEFontCache/misc.odin
+++ b/code/font/VEFontCache/misc.odin
@@ -1,6 +1,9 @@
 package VEFontCache

 import "base:runtime"
+import "core:simd"
+import "core:math"
+
 // import core_log "core:log"

 Colour  :: [4]f32
@@ -50,54 +53,6 @@ font_glyph_lru_code :: #force_inline proc "contextless" ( font : FontID, glyph_i
 	return
 }

-
-// For a provided alpha value,
-// allows the function to calculate the position of a point along the curve at any given fraction of its total length
-// ve_fontcache_eval_bezier (quadratic)
-eval_point_on_bezier3 :: #force_inline proc "contextless" ( p0, p1, p2 : Vec2, alpha : f32 ) -> Vec2
-{
-	p0    := vec2_64(p0)
-	p1    := vec2_64(p1)
-	p2    := vec2_64(p2)
-	alpha := f64(alpha)
-
-	weight_start   := (1 - alpha) * (1 - alpha)
-	weight_control := 2.0 * (1 - alpha) * alpha
-	weight_end     := alpha * alpha
-
-	starting_point := p0 * weight_start
-	control_point  := p1 * weight_control
-	end_point      := p2 * weight_end
-
-	point := starting_point + control_point + end_point
-	return { f32(point.x), f32(point.y) }
-}
-
-// For a provided alpha value,
-// allows the function to calculate the position of a point along the curve at any given fraction of its total length
-// ve_fontcache_eval_bezier (cubic)
-eval_point_on_bezier4 :: #force_inline proc "contextless" ( p0, p1, p2, p3 : Vec2, alpha : f32 ) -> Vec2
-{
-	p0    := vec2_64(p0)
-	p1    := vec2_64(p1)
-	p2    := vec2_64(p2)
-	p3    := vec2_64(p3)
-	alpha := f64(alpha)
-
-	weight_start := (1 - alpha) * (1 - alpha) * (1 - alpha)
-	weight_c_a   := 3 * (1 - alpha) * (1 - alpha) * alpha
-	weight_c_b   := 3 * (1 - alpha) * alpha * alpha
-	weight_end   := alpha * alpha * alpha
-
-	start_point := p0 * weight_start
-	control_a   := p1 * weight_c_a
-	control_b   := p2 * weight_c_b
-	end_point   := p3 * weight_end
-
-	point := start_point + control_a + control_b + end_point
-	return { f32(point.x), f32(point.y) }
-}
-
 is_empty :: #force_inline proc ( ctx : ^Context, entry : ^Entry, glyph_index : Glyph ) -> b32
 {
 	if glyph_index == 0 do return true
@@ -115,7 +70,8 @@ reset_batch_codepoint_state :: #force_inline proc( ctx : ^Context ) {
 	ctx.temp_codepoint_seen_num = 0
 }

-screenspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 ) {
+screenspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 )
+{
 	if true
 	{
 		pos_64   := vec2_64_from_vec2(position^)
@@ -142,7 +98,8 @@ screenspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2
 	}
 }

-textspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 ) {
+textspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2, size : Vec2 )
+{
 	if true
 	{
 		pos_64   := vec2_64_from_vec2(position^)
@@ -162,3 +119,170 @@ textspace_x_form :: #force_inline proc "contextless" ( position, scale : ^Vec2,
 		(scale^)    *= quotient
 	}
 }
+
+Use_SIMD_For_Bezier_Ops :: true
+
+when ! Use_SIMD_For_Bezier_Ops
+{
+	// For a provided alpha value,
+	// allows the function to calculate the position of a point along the curve at any given fraction of its total length
+	// ve_fontcache_eval_bezier (quadratic)
+	eval_point_on_bezier3 :: #force_inline proc "contextless" ( p0, p1, p2 : Vec2, alpha : f32 ) -> Vec2
+	{
+		p0    := vec2_64(p0)
+		p1    := vec2_64(p1)
+		p2    := vec2_64(p2)
+		alpha := f64(alpha)
+
+		weight_start   := (1 - alpha) * (1 - alpha)
+		weight_control := 2.0 * (1 - alpha) * alpha
+		weight_end     := alpha * alpha
+
+		starting_point := p0 * weight_start
+		control_point  := p1 * weight_control
+		end_point      := p2 * weight_end
+
+		point := starting_point + control_point + end_point
+		return { f32(point.x), f32(point.y) }
+	}
+
+	// For a provided alpha value,
+	// allows the function to calculate the position of a point along the curve at any given fraction of its total length
+	// ve_fontcache_eval_bezier (cubic)
+	eval_point_on_bezier4 :: #force_inline proc "contextless" ( p0, p1, p2, p3 : Vec2, alpha : f32 ) -> Vec2
+	{
+		p0    := vec2_64(p0)
+		p1    := vec2_64(p1)
+		p2    := vec2_64(p2)
+		p3    := vec2_64(p3)
+		alpha := f64(alpha)
+
+		weight_start := (1 - alpha) * (1 - alpha) * (1 - alpha)
+		weight_c_a   := 3 * (1 - alpha) * (1 - alpha) * alpha
+		weight_c_b   := 3 * (1 - alpha) * alpha * alpha
+		weight_end   := alpha * alpha * alpha
+
+		start_point := p0 * weight_start
+		control_a   := p1 * weight_c_a
+		control_b   := p2 * weight_c_b
+		end_point   := p3 * weight_end
+
+		point := start_point + control_a + control_b + end_point
+		return { f32(point.x), f32(point.y) }
+	}
+}
+else
+{
+	Vec2_SIMD :: simd.f32x4
+
+	vec2_to_simd :: #force_inline proc "contextless" (v: Vec2) -> Vec2_SIMD {
+		return Vec2_SIMD{v.x, v.y, 0, 0}
+	}
+
+	simd_to_vec2 :: #force_inline proc "contextless" (v: Vec2_SIMD) -> Vec2 {
+		return Vec2{simd.extract(v, 0), simd.extract(v, 1)}
+	}
+
+	vec2_add_simd :: #force_inline proc "contextless" (a, b: Vec2) -> Vec2 {
+		simd_a := vec2_to_simd(a)
+		simd_b := vec2_to_simd(b)
+		result := simd.add(simd_a, simd_b)
+		return simd_to_vec2(result)
+	}
+
+	vec2_sub_simd :: #force_inline proc "contextless" (a, b: Vec2) -> Vec2 {
+		simd_a := vec2_to_simd(a)
+		simd_b := vec2_to_simd(b)
+		result := simd.sub(simd_a, simd_b)
+		return simd_to_vec2(result)
+	}
+
+	vec2_mul_simd :: #force_inline proc "contextless" (a: Vec2, s: f32) -> Vec2 {
+		simd_a := vec2_to_simd(a)
+		simd_s := Vec2_SIMD{s, s, s, s}
+		result := simd.mul(simd_a, simd_s)
+		return simd_to_vec2(result)
+	}
+
+	vec2_div_simd :: #force_inline proc "contextless" (a: Vec2, s: f32) -> Vec2 {
+		simd_a := vec2_to_simd(a)
+		simd_s := Vec2_SIMD{s, s, s, s}
+		result := simd.div(simd_a, simd_s)
+		return simd_to_vec2(result)
+	}
+
+	vec2_dot_simd :: #force_inline proc "contextless" (a, b: Vec2) -> f32 {
+		simd_a := vec2_to_simd(a)
+		simd_b := vec2_to_simd(b)
+		result := simd.mul(simd_a, simd_b)
+		return simd.reduce_add_ordered(result)
+	}
+
+	vec2_length_sqr_simd :: #force_inline proc "contextless" (a: Vec2) -> f32 {
+		return vec2_dot_simd(a, a)
+	}
+
+	vec2_length_simd :: #force_inline proc "contextless" (a: Vec2) -> f32 {
+		return math.sqrt(vec2_length_sqr_simd(a))
+	}
+
+	vec2_normalize_simd :: #force_inline proc "contextless" (a: Vec2) -> Vec2 {
+		len := vec2_length_simd(a)
+		if len > 0 {
+			inv_len := 1.0 / len
+			return vec2_mul_simd(a, inv_len)
+		}
+		return a
+	}
+
+	// SIMD-optimized version of eval_point_on_bezier3
+	eval_point_on_bezier3 :: #force_inline proc "contextless" (p0, p1, p2: Vec2, alpha: f32) -> Vec2
+	{
+		simd_p0 := vec2_to_simd(p0)
+		simd_p1 := vec2_to_simd(p1)
+		simd_p2 := vec2_to_simd(p2)
+
+		one_minus_alpha := 1.0 - alpha
+		weight_start    := one_minus_alpha * one_minus_alpha
+		weight_control  := 2.0 * one_minus_alpha * alpha
+		weight_end      := alpha * alpha
+
+		simd_weights := Vec2_SIMD{weight_start, weight_control, weight_end, 0}
+		result := simd.add(
+			simd.add(
+				simd.mul( simd_p0, simd.swizzle( simd_weights, 0, 0, 0, 0) ),
+				simd.mul( simd_p1, simd.swizzle( simd_weights, 1, 1, 1, 1) )
+			),
+			simd.mul( simd_p2, simd.swizzle(simd_weights, 2, 2, 2, 2) )
+		)
+
+		return simd_to_vec2(result)
+	}
+
+	eval_point_on_bezier4 :: #force_inline proc "contextless" (p0, p1, p2, p3: Vec2, alpha: f32) -> Vec2
+	{
+		simd_p0 := vec2_to_simd(p0)
+		simd_p1 := vec2_to_simd(p1)
+		simd_p2 := vec2_to_simd(p2)
+		simd_p3 := vec2_to_simd(p3)
+
+		one_minus_alpha := 1.0 - alpha
+		weight_start    := one_minus_alpha * one_minus_alpha * one_minus_alpha
+		weight_c_a      := 3 * one_minus_alpha * one_minus_alpha * alpha
+		weight_c_b      := 3 * one_minus_alpha * alpha * alpha
+		weight_end      := alpha * alpha * alpha
+
+		simd_weights := Vec2_SIMD { weight_start, weight_c_a, weight_c_b, weight_end }
+		result      := simd.add(
+			simd.add(
+				simd.mul( simd_p0, simd.swizzle(simd_weights, 0, 0, 0, 0) ),
+				simd.mul( simd_p1, simd.swizzle(simd_weights, 1, 1, 1, 1) )
+			),
+			simd.add(
+				simd.mul( simd_p2, simd.swizzle(simd_weights, 2, 2, 2, 2) ),
+				simd.mul( simd_p3, simd.swizzle(simd_weights, 3, 3, 3, 3) )
+			)
+		)
+		return simd_to_vec2(result)
+	}
+}
--- a/code/font/VEFontCache/shaped_text.odin
+++ b/code/font/VEFontCache/shaped_text.odin
@@ -40,7 +40,6 @@ shape_text_cached :: proc( ctx : ^Context, font : FontID, text_utf8 : string, en
 			shape_cache_idx            = shape_cache.next_cache_id
 			shape_cache.next_cache_id += 1
 			evicted := LRU_put( state, lru_code, shape_cache_idx )
-			assert( evicted == lru_code )
 		}
 		else
 		{
@@ -54,7 +53,6 @@ shape_text_cached :: proc( ctx : ^Context, font : FontID, text_utf8 : string, en
 		}

 		shape_entry := & shape_cache.storage[ shape_cache_idx ]
-		// shape_entry.storage_hash = lru_code
 		shape_text_uncached( ctx, font, text_utf8, entry, shape_entry )
 	}