From 8deeb40e5de1d33b571f0b1faf7b8dea678cd91b Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 4 Aug 2024 15:47:00 -0400
Subject: [PATCH 01/13] Add vectorized `index_byte` and `last_index_byte`

---
 core/simd/util/util.odin | 188 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 core/simd/util/util.odin

diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
new file mode 100644
index 000000000..ac523b42a
--- /dev/null
+++ b/core/simd/util/util.odin
@@ -0,0 +1,188 @@
+/*
+	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
+	Made available under Odin's BSD-3 license.
+
+	List of contributors:
+		Feoramund: `index_byte` procedures.
+*/
+
+// package simd_util implements compositions of SIMD operations for optimizing
+// the core library where available.
+
+//+build i386, amd64
+package simd_util
+
+import "base:intrinsics"
+import "core:simd/x86"
+
+@private SCAN_REGISTER_SIZE :: 16
+@private SCAN_REGISTERS     :: 4
+@private SCAN_WIDTH         :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+
+// How long should a string be before using any of the `index_*` procedures in
+// this package.
+RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+
+/*
+Scan a slice of bytes for a specific byte.
+
+This procedure safely handles padding out slices of any length, including empty
+slices.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+@(enable_target_feature="sse2")
+index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
+	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+	i: int
+	length := len(data)
+	full_chunks_length := length - length % SCAN_WIDTH
+
+	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+		#unroll for j in 0..<SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			// NOTE(Feoramund): I experimented with ORing all the masks onto a
+			// 128-bit integer before performing the `mask != 0` check to see
+			// if that might be faster. However, the cost to avoid 3
+			// compares resulted in a marginally slower runtime on my machine.
+			//
+			// Simpler won out here.
+			if mask != 0 {
+				ctz := intrinsics.count_trailing_zeros(mask)
+				return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+			}
+		}
+	}
+
+	if i < length {
+		// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
+		// what we're looking for yet, so we must pad out the end, then run our
+		// algorithm on it.
+		padded_data_end: [SCAN_WIDTH]u8 = ---
+		remnant_length := length % SCAN_WIDTH
+		intrinsics.mem_copy_non_overlapping(
+			&padded_data_end[0],
+			&raw_data(data)[full_chunks_length],
+			remnant_length,
+		)
+
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+
+		#unroll for j in 0..<SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			// Because this data is padded out, it's possible that we could
+			// match on uninitialized memory, so we must guard against that.
+
+			// Create a relevancy mask: (Example)
+			//
+			//    max(u64)        = 0xFFFF_FFFF_FFFF_FFFF
+			//
+			//  Convert an integer into a stream of on-bits by using the
+			//  shifted negation of the maximum. The subtraction selects which
+			//  section of the overall mask we should apply.
+			//
+			//                   << 17 - (1 * SCAN_REGISTER_SIZE)
+			//                    = 0xFFFF_FFFF_FFFF_FFFE
+			//
+			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
+			//
+			//    ~submask        = 0x0000_0000_0000_0001
+			//    (submask >> 63) = 0x0000_0000_0000_0001
+			//
+			//  The multiplication is a guard against zero.
+			//
+			submask = ~submask * (submask >> 63)
+			//
+			//  Finally, mask out any irrelevant bits with the submask.
+			mask &= i32(submask)
+
+			if mask != 0 {
+				ctz := int(intrinsics.count_trailing_zeros(mask))
+				return i + j * SCAN_REGISTER_SIZE + ctz
+			}
+		}
+	}
+
+	return -1
+}
+
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles padding out slices of any length, including empty
+slices.
+
+Inputs:
+- data: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+@(enable_target_feature="sse2")
+last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
+	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
+	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+
+	i := len(data) - SCAN_WIDTH
+
+	for /**/; i >= 0; i -= SCAN_WIDTH {
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
+
+		// There is no #reverse #unroll at the time of this writing, so we use
+		// `j` to count down by subtraction.
+		#unroll for j in 1..=SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			if mask != 0 {
+				// CLZ is used instead to get the on-bit from the other end.
+				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+			}
+		}
+	}
+
+	if i < 0 {
+		padded_data_end: [SCAN_WIDTH]u8 = ---
+		remnant_length := len(data) % SCAN_WIDTH
+		intrinsics.mem_copy_non_overlapping(
+			&padded_data_end[0],
+			&raw_data(data)[0],
+			remnant_length,
+		)
+
+		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+
+		#unroll for j in 1..=SCAN_REGISTERS {
+			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
+			mask := x86._mm_movemask_epi8(cmp)
+
+			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
+			submask = ~submask * (submask >> 63)
+
+			mask &= i32(submask)
+
+			if mask != 0 {
+				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
+				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+			}
+		}
+	}
+
+	return -1
+}

From f66fcd9acb390b199452a125ed09899dffefde5d Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 4 Aug 2024 15:58:56 -0400
Subject: [PATCH 02/13] Use vectorized `index_*` procs in `core`

---
 core/bytes/bytes.odin     | 47 ++++++++++++++++++++++++++++++++-------
 core/strings/strings.odin | 47 ++++++++++++++++++++++++++++++++-------
 2 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index 7cbf092ac..dcd4931e2 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -1,6 +1,8 @@
 package bytes
 
+import "base:intrinsics"
 import "core:mem"
+@require import simd_util "core:simd/util"
 import "core:unicode"
 import "core:unicode/utf8"
 
@@ -295,22 +297,51 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 
 
 index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := 0; i < len(s); i += 1 {
-		if s[i] == c {
-			return i
+	_index_byte :: #force_inline proc(s: []byte, c: byte) -> int {
+		for i := 0; i < len(s); i += 1 {
+			if s[i] == c {
+				return i
+			}
 		}
+		return -1
+	}
+
+	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
+	// significant speedup when compiling in either Size or Speed mode.
+	// The SIMD version is usually 2-3x slower without optimizations on.
+	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
+		// SIMD's benefits are noticeable only past a certain threshold of data.
+		// For small data, use the plain old algorithm.
+		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
+			return simd_util.index_byte(s, c)
+		} else {
+			return _index_byte(s, c)
+		}
+	} else {
+		return _index_byte(s, c)
 	}
-	return -1
 }
 
 // Returns -1 if c is not present
 last_index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := len(s)-1; i >= 0; i -= 1 {
-		if s[i] == c {
-			return i
+	_last_index_byte :: #force_inline proc(s: []byte, c: byte) -> int {
+		for i := len(s)-1; i >= 0; i -= 1 {
+			if s[i] == c {
+				return i
+			}
 		}
+		return -1
+	}
+
+	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
+		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
+			return simd_util.last_index_byte(s, c)
+		} else {
+			return _last_index_byte(s, c)
+		}
+	} else {
+		return _last_index_byte(s, c)
 	}
-	return -1
 }
 
 
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index e9b50bab0..9d3e88165 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1,7 +1,9 @@
 // Procedures to manipulate UTF-8 encoded strings
 package strings
 
+import "base:intrinsics"
 import "core:io"
+@require import simd_util "core:simd/util"
 import "core:mem"
 import "core:unicode"
 import "core:unicode/utf8"
@@ -1424,12 +1426,29 @@ Output:
 
 */
 index_byte :: proc(s: string, c: byte) -> (res: int) {
-	for i := 0; i < len(s); i += 1 {
-		if s[i] == c {
-			return i
+	_index_byte :: #force_inline proc(s: string, c: byte) -> int {
+		for i := 0; i < len(s); i += 1 {
+			if s[i] == c {
+				return i
+			}
 		}
+		return -1
+	}
+
+	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
+	// significant speedup when compiling in either Size or Speed mode.
+	// The SIMD version is usually 2-3x slower without optimizations on.
+	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
+		// SIMD's benefits are noticeable only past a certain threshold of data.
+		// For small data, use the plain old algorithm.
+		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
+			return simd_util.index_byte(transmute([]u8)s, c)
+		} else {
+			return _index_byte(s, c)
+		}
+	} else {
+		return _index_byte(s, c)
 	}
-	return -1
 }
 /*
 Returns the byte offset of the last byte `c` in the string `s`, -1 when not found.
@@ -1464,12 +1483,24 @@ Output:
 
 */
 last_index_byte :: proc(s: string, c: byte) -> (res: int) {
-	for i := len(s)-1; i >= 0; i -= 1 {
-		if s[i] == c {
-			return i
+	_last_index_byte :: #force_inline proc(s: string, c: byte) -> int {
+		for i := len(s)-1; i >= 0; i -= 1 {
+			if s[i] == c {
+				return i
+			}
 		}
+		return -1
+	}
+
+	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
+		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
+			return simd_util.last_index_byte(transmute([]u8)s, c)
+		} else {
+			return _last_index_byte(s, c)
+		}
+	} else {
+		return _last_index_byte(s, c)
 	}
-	return -1
 }
 /*
 Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found.

From 28c98c2e7af8b1ce49ead18398fc7793b00e0df3 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 4 Aug 2024 16:07:19 -0400
Subject: [PATCH 03/13] Add tests for vectorized `index_*` procs

---
 tests/core/normal.odin                        |   1 +
 tests/core/simd/util/test_core_simd_util.odin | 108 ++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 tests/core/simd/util/test_core_simd_util.odin

diff --git a/tests/core/normal.odin b/tests/core/normal.odin
index 8cd3b3917..a1b948fea 100644
--- a/tests/core/normal.odin
+++ b/tests/core/normal.odin
@@ -34,6 +34,7 @@ download_assets :: proc() {
 @(require) import "path/filepath"
 @(require) import "reflect"
 @(require) import "runtime"
+@(require) import "simd/util"
 @(require) import "slice"
 @(require) import "strconv"
 @(require) import "strings"
diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin
new file mode 100644
index 000000000..65bf566c0
--- /dev/null
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -0,0 +1,108 @@
+//+build i386, amd64
+package test_core_simd_util
+
+import simd_util "core:simd/util"
+import "core:testing"
+
+@test
+test_index_byte_sanity :: proc(t: ^testing.T) {
+	// We must be able to find the byte at the correct index.
+	for n in 1..<256 {
+		data := make([]u8, n)
+		defer delete(data)
+		for i in 0..<n-1 {
+			data[i] = '-'
+		}
+
+		// Find it at the end.
+		data[n-1] = 'o'
+		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-1) {
+			return
+		}
+		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
+			return
+		}
+		data[n-1] = '-'
+
+		// Find it in the middle.
+		data[n/2] = 'o'
+		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n/2) {
+			return
+		}
+		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n/2) {
+			return
+		}
+		data[n/2] = '-'
+
+		// Find it at the start.
+		data[0] = 'o'
+		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), 0) {
+			return
+		}
+		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), 0) {
+			return
+		}
+	}
+}
+
+@test
+test_index_byte_empty :: proc(t: ^testing.T) {
+	a: [1]u8
+	testing.expect_value(t, simd_util.index_byte(a[0:0], 'o'), -1)
+	testing.expect_value(t, simd_util.last_index_byte(a[0:0], 'o'), -1)
+}
+
+@test
+test_index_byte_multiple_hits :: proc(t: ^testing.T) {
+	for n in 5..<256 {
+		data := make([]u8, n)
+		defer delete(data)
+		for i in 0..<n-1 {
+			data[i] = '-'
+		}
+
+		data[n-1] = 'o'
+		data[n-3] = 'o'
+		data[n-5] = 'o'
+
+		// Find the first one.
+		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-5) {
+			return
+		}
+
+		// Find the last one.
+		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
+			return
+		}
+	}
+}
+
+@test
+test_index_byte_zero :: proc(t: ^testing.T) {
+	// This test protects against false positives in uninitialized memory.
+	for n in 1..<256 {
+		data := make([]u8, n + 64)
+		defer delete(data)
+		for i in 0..<n-1 {
+			data[i] = '-'
+		}
+
+		// Positive hit.
+		data[n-1] = 0
+		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), n-1) {
+			return
+		}
+		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), n-1) {
+			return
+		}
+
+		// Test for false positives.
+		data[n-1] = '-'
+		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), -1) {
+			return
+		}
+		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), -1) {
+			return
+		}
+	}
+}

From 0418d27bdf21b2384c1c76caeea116ac28b5d426 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 4 Aug 2024 16:13:00 -0400
Subject: [PATCH 04/13] Add benchmarks for vectorized `index_*` procs

---
 tests/benchmark/all.odin                      |   1 +
 .../simd/util/benchmark_simd_util.odin        | 117 ++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 tests/benchmark/simd/util/benchmark_simd_util.odin

diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin
index d1b7662e2..357d86f67 100644
--- a/tests/benchmark/all.odin
+++ b/tests/benchmark/all.odin
@@ -2,3 +2,4 @@ package benchmarks
 
 @(require) import "crypto"
 @(require) import "hash"
+@(require) import "simd/util"
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin
new file mode 100644
index 000000000..4538c6612
--- /dev/null
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -0,0 +1,117 @@
+//+build i386, amd64
+package benchmark_simd_util
+
+import "core:fmt"
+import "core:log"
+import simd_util "core:simd/util"
+import "core:testing"
+import "core:time"
+
+
+// These are the normal, unoptimized algorithms.
+
+plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
+	for i := 0; i < len(s); i += 1 {
+		if s[i] == c {
+			return i
+		}
+	}
+	return -1
+}
+
+plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
+	for i := len(s)-1; i >= 0; i -= 1 {
+		if s[i] == c {
+			return i
+		}
+	}
+	return -1
+}
+
+sizes := [?]int {
+	15, 16, 17,
+	31, 32, 33,
+	256,
+	512,
+	1024,
+	1024 * 1024,
+	1024 * 1024 * 1024,
+}
+
+run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
+	data := make([]u8, size)
+	defer delete(data)
+
+	for i in 0..<size {
+		data[i] = u8('0' + i % 10)
+	}
+	data[idx] = 'z'
+
+	accumulator: int
+
+	for _ in 0..<warmup {
+		accumulator += p(data, 'z')
+	}
+
+	for _ in 0..<runs {
+		start := time.now()
+		accumulator += p(data, 'z')
+		done := time.since(start)
+		timing += done
+	}
+
+	timing /= time.Duration(runs)
+
+	log.debug(accumulator)
+	return
+}
+
+HOT :: 3
+
+@test
+benchmark_plain_index_cold :: proc(t: ^testing.T) {
+	report: string
+	for size in sizes {
+		timing := run_trial_size(plain_index_byte, size, size - 1, 0, 1)
+		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
+		timing = run_trial_size(plain_last_index_byte, size, 0, 0, 1)
+		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
+	}
+	log.info(report)
+}
+
+@test
+benchmark_plain_index_hot :: proc(t: ^testing.T) {
+	report: string
+	for size in sizes {
+		timing := run_trial_size(plain_index_byte, size, size - 1, HOT, HOT)
+		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
+		timing = run_trial_size(plain_last_index_byte, size, 0, HOT, HOT)
+		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
+	}
+	log.info(report)
+}
+
+@test
+benchmark_simd_index_cold :: proc(t: ^testing.T) {
+	report: string
+	for size in sizes {
+		timing := run_trial_size(simd_util.index_byte, size, size - 1, 0, 1)
+		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
+		timing = run_trial_size(simd_util.last_index_byte, size, 0, 0, 1)
+		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
+	}
+	log.info(report)
+}
+
+@test
+benchmark_simd_index_hot :: proc(t: ^testing.T) {
+	report: string
+	for size in sizes {
+		timing := run_trial_size(simd_util.index_byte, size, size - 1, HOT, HOT)
+		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
+		timing = run_trial_size(simd_util.last_index_byte, size, 0, HOT, HOT)
+		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
+	}
+	log.info(report)
+}

From 793811b219e77b21a1c765323957e4b74ce13e64 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sun, 4 Aug 2024 16:14:17 -0400
Subject: [PATCH 05/13] Add `simd_util` to `examples/all`

---
 examples/all/all_main.odin | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin
index d92a6b8c4..43ea0de98 100644
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -115,6 +115,7 @@ import relative         "core:relative"
 import reflect          "core:reflect"
 import runtime          "base:runtime"
 import simd             "core:simd"
+import simd_util        "core:simd/util"
 import x86              "core:simd/x86"
 import slice            "core:slice"
 import slice_heap       "core:slice/heap"
@@ -237,6 +238,7 @@ _ :: relative
 _ :: reflect
 _ :: runtime
 _ :: simd
+_ :: simd_util
 _ :: x86
 _ :: slice
 _ :: slice_heap

From 12dd0cb72a586a99129280c78697089caab0500a Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:39:19 -0400
Subject: [PATCH 06/13] Simplify and make `simd_util` cross-platform

This new algorithm uses a Scalar->Vector->Scalar iteration loop, which
requires no masking off of any incomplete data chunks.

Also, the width was reduced from 64 bytes to 32, as I found this to be
about as fast as the previous 64-byte x86 version.
---
 core/bytes/bytes.odin                         |  18 +-
 core/simd/util/util.odin                      | 214 ++++++++----------
 core/strings/strings.odin                     |  18 +-
 .../simd/util/benchmark_simd_util.odin        |   1 -
 tests/core/simd/util/test_core_simd_util.odin |   1 -
 5 files changed, 101 insertions(+), 151 deletions(-)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index dcd4931e2..136c98f6b 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
index ac523b42a..b209a44ea 100644
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -8,26 +8,24 @@
 
 // package simd_util implements compositions of SIMD operations for optimizing
 // the core library where available.
-
-//+build i386, amd64
 package simd_util
 
 import "base:intrinsics"
-import "core:simd/x86"
 
-@private SCAN_REGISTER_SIZE :: 16
-@private SCAN_REGISTERS     :: 4
-@private SCAN_WIDTH         :: SCAN_REGISTERS * SCAN_REGISTER_SIZE
+@private SCAN_WIDTH :: 32
 
-// How long should a string be before using any of the `index_*` procedures in
-// this package.
-RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE
+@(private, rodata)
+simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
+	 0,  1,  2,  3,  4,  5,  6,  7,
+	 8,  9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
 
 /*
 Scan a slice of bytes for a specific byte.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -36,83 +34,54 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
-
-	i: int
 	length := len(data)
-	full_chunks_length := length - length % SCAN_WIDTH
+	i := 0
 
-	for /**/; i < full_chunks_length; i += SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// NOTE(Feoramund): I experimented with ORing all the masks onto a
-			// 128-bit integer before performing the `mask != 0` check to see
-			// if that might be faster. However, the cost to avoid 3
-			// compares resulted in a marginally slower runtime on my machine.
-			//
-			// Simpler won out here.
-			if mask != 0 {
-				ctz := intrinsics.count_trailing_zeros(mask)
-				return i + j * SCAN_REGISTER_SIZE + cast(int)ctz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
+	}
+
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i < alignment_start; i += 1 {
+		if data[i] == c {
+			return i
+		}
 	}
 
-	if i < length {
-		// The data is not exactly divisible by SCAN_WIDTH, and we haven't found
-		// what we're looking for yet, so we must pad out the end, then run our
-		// algorithm on it.
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := length % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[full_chunks_length],
-			remnant_length,
-		)
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SCAN_WIDTH
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
-
-		#unroll for j in 0..<SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			// Because this data is padded out, it's possible that we could
-			// match on uninitialized memory, so we must guard against that.
-
-			// Create a relevancy mask: (Example)
-			//
-			//    max(u64)        = 0xFFFF_FFFF_FFFF_FFFF
-			//
-			//  Convert an integer into a stream of on-bits by using the
-			//  shifted negation of the maximum. The subtraction selects which
-			//  section of the overall mask we should apply.
-			//
-			//                   << 17 - (1 * SCAN_REGISTER_SIZE)
-			//                    = 0xFFFF_FFFF_FFFF_FFFE
-			//
-			submask := max(u64) << u64(remnant_length - (j * SCAN_REGISTER_SIZE))
-			//
-			//    ~submask        = 0x0000_0000_0000_0001
-			//    (submask >> 63) = 0x0000_0000_0000_0001
-			//
-			//  The multiplication is a guard against zero.
-			//
-			submask = ~submask * (submask >> 63)
-			//
-			//  Finally, mask out any irrelevant bits with the submask.
-			mask &= i32(submask)
-
-			if mask != 0 {
-				ctz := int(intrinsics.count_trailing_zeros(mask))
-				return i + j * SCAN_REGISTER_SIZE + ctz
-			}
+	for /**/; i < tail; i += SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+	
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if data[i] == c {
+			return i
 		}
 	}
 
@@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
 Scan a slice of bytes for a specific byte, starting from the end and working
 backwards to the start.
 
-This procedure safely handles padding out slices of any length, including empty
-slices.
+This procedure safely handles slices of any length, including empty slices.
 
 Inputs:
 - data: A slice of bytes.
@@ -133,54 +101,58 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-@(enable_target_feature="sse2")
 last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
-	scanner_data: [SCAN_REGISTER_SIZE]u8 = c
-	scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0])
+	length := len(data)
+	i := length - 1
 
-	i := len(data) - SCAN_WIDTH
-
-	for /**/; i >= 0; i -= SCAN_WIDTH {
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i])
-
-		// There is no #reverse #unroll at the time of this writing, so we use
-		// `j` to count down by subtraction.
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
-
-			if mask != 0 {
-				// CLZ is used instead to get the on-bit from the other end.
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
+	// Guard against small strings.
+	if length < SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if data[i] == c {
+				return i
 			}
 		}
+		return -1
+	}
+
+	ptr := cast(int)cast(uintptr)raw_data(data)
+
+	tail := length - (ptr + length) % SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i >= tail; i -= 1 {
+		if data[i] == c {
+			return i
+		}
 	}
 
-	if i < 0 {
-		padded_data_end: [SCAN_WIDTH]u8 = ---
-		remnant_length := len(data) % SCAN_WIDTH
-		intrinsics.mem_copy_non_overlapping(
-			&padded_data_end[0],
-			&raw_data(data)[0],
-			remnant_length,
-		)
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SCAN_WIDTH]u8 = c
+	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
 
-		simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0])
+	i -= SCAN_WIDTH - 1
 
-		#unroll for j in 1..=SCAN_REGISTERS {
-			cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner)
-			mask := x86._mm_movemask_epi8(cmp)
+	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
+		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
 
-			submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE)
-			submask = ~submask * (submask >> 63)
-
-			mask &= i32(submask)
-
-			if mask != 0 {
-				clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask))
-				return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz
-			}
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SCAN_WIDTH - 1
+	
+	for /**/; i >= 0; i -= 1 {
+		if data[i] == c {
+			return i
 		}
 	}
 
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index 9d3e88165..b8e43f90d 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) {
 	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
 	// significant speedup when compiling in either Size or Speed mode.
 	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		// SIMD's benefits are noticeable only past a certain threshold of data.
-		// For small data, use the plain old algorithm.
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.index_byte(transmute([]u8)s, c)
-		} else {
-			return _index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
 	} else {
 		return _index_byte(s, c)
 	}
@@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) {
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") {
-		if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE {
-			return simd_util.last_index_byte(transmute([]u8)s, c)
-		} else {
-			return _last_index_byte(s, c)
-		}
+	when ODIN_OPTIMIZATION_MODE > .Minimal {
+		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
 	} else {
 		return _last_index_byte(s, c)
 	}
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin
index 4538c6612..18fa0a9e3 100644
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package benchmark_simd_util
 
 import "core:fmt"
diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin
index 65bf566c0..ff7e1f9aa 100644
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -1,4 +1,3 @@
-//+build i386, amd64
 package test_core_simd_util
 
 import simd_util "core:simd/util"

From c8a62ee4ec9b7beec6dcff907ad2dfecdd547f22 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:42:14 -0400
Subject: [PATCH 07/13] Make `simd_util` index procs `contextless` where
 applicable

---
 core/bytes/bytes.odin                              | 4 ++--
 core/simd/util/util.odin                           | 4 ++--
 core/strings/strings.odin                          | 4 ++--
 tests/benchmark/simd/util/benchmark_simd_util.odin | 6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index 136c98f6b..4edd089b9 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -297,7 +297,7 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 
 
 index_byte :: proc(s: []byte, c: byte) -> int {
-	_index_byte :: #force_inline proc(s: []byte, c: byte) -> int {
+	_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
 		for i := 0; i < len(s); i += 1 {
 			if s[i] == c {
 				return i
@@ -318,7 +318,7 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 
 // Returns -1 if c is not present
 last_index_byte :: proc(s: []byte, c: byte) -> int {
-	_last_index_byte :: #force_inline proc(s: []byte, c: byte) -> int {
+	_last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
 		for i := len(s)-1; i >= 0; i -= 1 {
 			if s[i] == c {
 				return i
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
index b209a44ea..74401689a 100644
--- a/core/simd/util/util.odin
+++ b/core/simd/util/util.odin
@@ -34,7 +34,7 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check {
+index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check {
 	length := len(data)
 	i := 0
 
@@ -101,7 +101,7 @@ Inputs:
 Returns:
 - index: The index of the byte `c`, or -1 if it was not found.
 */
-last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check {
+last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check {
 	length := len(data)
 	i := length - 1
 
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index b8e43f90d..ed7f494ae 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -1426,7 +1426,7 @@ Output:
 
 */
 index_byte :: proc(s: string, c: byte) -> (res: int) {
-	_index_byte :: #force_inline proc(s: string, c: byte) -> int {
+	_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
 		for i := 0; i < len(s); i += 1 {
 			if s[i] == c {
 				return i
@@ -1477,7 +1477,7 @@ Output:
 
 */
 last_index_byte :: proc(s: string, c: byte) -> (res: int) {
-	_last_index_byte :: #force_inline proc(s: string, c: byte) -> int {
+	_last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
 		for i := len(s)-1; i >= 0; i -= 1 {
 			if s[i] == c {
 				return i
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin
index 18fa0a9e3..e2187ce45 100644
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/simd/util/benchmark_simd_util.odin
@@ -9,7 +9,7 @@ import "core:time"
 
 // These are the normal, unoptimized algorithms.
 
-plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
+plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
 	for i := 0; i < len(s); i += 1 {
 		if s[i] == c {
 			return i
@@ -18,7 +18,7 @@ plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
 	return -1
 }
 
-plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
+plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
 	for i := len(s)-1; i >= 0; i -= 1 {
 		if s[i] == c {
 			return i
@@ -37,7 +37,7 @@ sizes := [?]int {
 	1024 * 1024 * 1024,
 }
 
-run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
+run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
 	data := make([]u8, size)
 	defer delete(data)
 

From 0d29cc3375d4f20e122df23726cc526f96bf3305 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:46:47 -0400
Subject: [PATCH 08/13] Use `for x in y` construct for `bytes` iteration

This cannot be applied to the `strings` version, as that would cause a
rune-by-rune iteration, not a byte-by-byte one.
---
 core/bytes/bytes.odin | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index 4edd089b9..e130502a1 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -298,8 +298,8 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 
 index_byte :: proc(s: []byte, c: byte) -> int {
 	_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
-		for i := 0; i < len(s); i += 1 {
-			if s[i] == c {
+		for ch, i in s {
+			if ch == c {
 				return i
 			}
 		}
@@ -319,8 +319,8 @@ index_byte :: proc(s: []byte, c: byte) -> int {
 // Returns -1 if c is not present
 last_index_byte :: proc(s: []byte, c: byte) -> int {
 	_last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
-		for i := len(s)-1; i >= 0; i -= 1 {
-			if s[i] == c {
+		#reverse for ch, i in s {
+			if ch == c {
 				return i
 			}
 		}

From e7e7fe766a2e9171b7edff58cfdf889a41e1094e Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Fri, 9 Aug 2024 17:47:27 -0400
Subject: [PATCH 09/13] Add test for misaligned data to `core:simd/util` suite

---
 tests/core/simd/util/test_core_simd_util.odin | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin
index ff7e1f9aa..ba302a121 100644
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/simd/util/test_core_simd_util.odin
@@ -105,3 +105,37 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 		}
 	}
 }
+
+@test
+test_misaligned_data :: proc(t: ^testing.T) {
+	for n in 2..<256 {
+		data := make([]u8, n)
+		defer delete(data)
+		for i in 0..<n-1 {
+			data[i] = '-'
+		}
+
+		for m in 1..<n {
+			data[n-1] = 'o'
+			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), n-1-m) {
+				return
+			}
+			data[n-1] = '-'
+
+			data[m+(n-m)/2] = 'o'
+			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), (n-m)/2) {
+				return
+			}
+			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), (n-m)/2) {
+				return
+			}
+			data[m+(n-m)/2] = '-'
+
+			data[m]   = 'o'
+			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), 0) {
+				return
+			}
+			data[m]   = '-'
+		}
+	}
+}

From c69fa87d53297325d235d1a5ff57d84655ae2217 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sat, 10 Aug 2024 07:17:03 -0400
Subject: [PATCH 10/13] Merge `core:simd/util` into `core:bytes`

---
 core/bytes/bytes.odin                         | 151 ++++++++++++++---
 core/simd/util/util.odin                      | 160 ------------------
 core/strings/strings.odin                     |  35 +---
 examples/all/all_main.odin                    |   2 -
 tests/benchmark/all.odin                      |   2 +-
 .../benchmark_bytes.odin}                     |  18 +-
 .../test_core_bytes.odin}                     |  40 ++---
 tests/core/normal.odin                        |   2 +-
 8 files changed, 164 insertions(+), 246 deletions(-)
 delete mode 100644 core/simd/util/util.odin
 rename tests/benchmark/{simd/util/benchmark_simd_util.odin => bytes/benchmark_bytes.odin} (75%)
 rename tests/core/{simd/util/test_core_simd_util.odin => bytes/test_core_bytes.odin} (54%)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index e130502a1..e09859a19 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -2,10 +2,21 @@ package bytes
 
 import "base:intrinsics"
 import "core:mem"
-@require import simd_util "core:simd/util"
 import "core:unicode"
 import "core:unicode/utf8"
 
+
+@private SIMD_SCAN_WIDTH :: 32
+
+@(private, rodata)
+simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+	 0,  1,  2,  3,  4,  5,  6,  7,
+	 8,  9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23,
+	24, 25, 26, 27, 28, 29, 30, 31,
+}
+
+
 clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte {
 	c := make([]byte, len(s), allocator, loc)
 	copy(c, s)
@@ -295,43 +306,141 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 	return _split_iterator(s, sep, len(sep))
 }
 
+/*
+Scan a slice of bytes for a specific byte.
 
-index_byte :: proc(s: []byte, c: byte) -> int {
-	_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
-		for ch, i in s {
-			if ch == c {
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
+	length := len(s)
+	i := 0
+
+	// Guard against small inputs: slices shorter than `SIMD_SCAN_WIDTH` are scanned as a scalar.
+	if length < SIMD_SCAN_WIDTH {
+		for /**/; i < length; i += 1 {
+			if s[i] == c {
 				return i
 			}
 		}
 		return -1
 	}
 
-	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
-	// significant speedup when compiling in either Size or Speed mode.
-	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal {
-		return #force_inline simd_util.index_byte(s, c)
-	} else {
-		return _index_byte(s, c)
+	ptr := cast(int)cast(uintptr)raw_data(s)
+
+	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i < alignment_start; i += 1 {
+		if s[i] == c {
+			return i
+		}
 	}
+
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
+	tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH
+
+	for /**/; i < tail; i += SIMD_SCAN_WIDTH {
+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF)
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_min(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+
+	// Iterate as a scalar over the remaining unaligned portion.
+	for /**/; i < length; i += 1 {
+		if s[i] == c {
+			return i
+		}
+	}
+
+	return -1
 }
 
-// Returns -1 if c is not present
-last_index_byte :: proc(s: []byte, c: byte) -> int {
-	_last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int {
-		#reverse for ch, i in s {
-			if ch == c {
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
+	length := len(s)
+	i := length - 1
+
+	// Guard against small inputs: slices shorter than `SIMD_SCAN_WIDTH` are scanned as a scalar.
+	if length < SIMD_SCAN_WIDTH {
+		for /**/; i >= 0; i -= 1 {
+			if s[i] == c {
 				return i
 			}
 		}
 		return -1
 	}
 
-	when ODIN_OPTIMIZATION_MODE > .Minimal {
-		return #force_inline simd_util.last_index_byte(s, c)
-	} else {
-		return _last_index_byte(s, c)
+	ptr := cast(int)cast(uintptr)raw_data(s)
+
+	tail := length - (ptr + length) % SIMD_SCAN_WIDTH
+
+	// Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary.
+	//
+	// This way, every load in the vector loop will be aligned, which should be
+	// the fastest possible scenario.
+	for /**/; i >= tail; i -= 1 {
+		if s[i] == c {
+			return i
+		}
 	}
+
+	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
+	scanner: #simd[SIMD_SCAN_WIDTH]u8 = c
+	alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH
+
+	i -= SIMD_SCAN_WIDTH - 1
+
+	for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH {
+		load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^
+		comparison := intrinsics.simd_lanes_eq(load, scanner)
+		match := intrinsics.simd_reduce_or(comparison)
+		if match > 0 {
+			sentinel: #simd[SIMD_SCAN_WIDTH]u8
+			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
+			index_reduce := intrinsics.simd_reduce_max(index_select)
+			return i + cast(int)index_reduce
+		}
+	}
+
+	// Iterate as a scalar over the remaining unaligned portion.
+	i += SIMD_SCAN_WIDTH - 1
+
+	for /**/; i >= 0; i -= 1 {
+		if s[i] == c {
+			return i
+		}
+	}
+
+	return -1
 }
 
 
diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin
deleted file mode 100644
index 74401689a..000000000
--- a/core/simd/util/util.odin
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
-	(c) Copyright 2024 Feoramund <rune@swevencraft.org>.
-	Made available under Odin's BSD-3 license.
-
-	List of contributors:
-		Feoramund: `index_byte` procedures.
-*/
-
-// package simd_util implements compositions of SIMD operations for optimizing
-// the core library where available.
-package simd_util
-
-import "base:intrinsics"
-
-@private SCAN_WIDTH :: 32
-
-@(private, rodata)
-simd_scanner_indices := #simd[SCAN_WIDTH]u8 {
-	 0,  1,  2,  3,  4,  5,  6,  7,
-	 8,  9, 10, 11, 12, 13, 14, 15,
-	16, 17, 18, 19, 20, 21, 22, 23,
-	24, 25, 26, 27, 28, 29, 30, 31,
-}
-
-/*
-Scan a slice of bytes for a specific byte.
-
-This procedure safely handles slices of any length, including empty slices.
-
-Inputs:
-- data: A slice of bytes.
-- c: The byte to search for.
-
-Returns:
-- index: The index of the byte `c`, or -1 if it was not found.
-*/
-index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check {
-	length := len(data)
-	i := 0
-
-	// Guard against small strings.
-	if length < SCAN_WIDTH {
-		for /**/; i < length; i += 1 {
-			if data[i] == c {
-				return i
-			}
-		}
-		return -1
-	}
-
-	ptr := cast(int)cast(uintptr)raw_data(data)
-
-	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
-
-	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
-	//
-	// This way, every load in the vector loop will be aligned, which should be
-	// the fastest possible scenario.
-	for /**/; i < alignment_start; i += 1 {
-		if data[i] == c {
-			return i
-		}
-	}
-
-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
-	scanner: #simd[SCAN_WIDTH]u8 = c
-	tail := length - (length - alignment_start) % SCAN_WIDTH
-
-	for /**/; i < tail; i += SCAN_WIDTH {
-		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
-		comparison := intrinsics.simd_lanes_eq(load, scanner)
-		match := intrinsics.simd_reduce_or(comparison)
-		if match > 0 {
-			sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF)
-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
-			index_reduce := intrinsics.simd_reduce_min(index_select)
-			return i + cast(int)index_reduce
-		}
-	}
-	
-	// Iterate as a scalar over the remaining unaligned portion.
-	for /**/; i < length; i += 1 {
-		if data[i] == c {
-			return i
-		}
-	}
-
-	return -1
-}
-
-/*
-Scan a slice of bytes for a specific byte, starting from the end and working
-backwards to the start.
-
-This procedure safely handles slices of any length, including empty slices.
-
-Inputs:
-- data: A slice of bytes.
-- c: The byte to search for.
-
-Returns:
-- index: The index of the byte `c`, or -1 if it was not found.
-*/
-last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check {
-	length := len(data)
-	i := length - 1
-
-	// Guard against small strings.
-	if length < SCAN_WIDTH {
-		for /**/; i >= 0; i -= 1 {
-			if data[i] == c {
-				return i
-			}
-		}
-		return -1
-	}
-
-	ptr := cast(int)cast(uintptr)raw_data(data)
-
-	tail := length - (ptr + length) % SCAN_WIDTH
-
-	// Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary.
-	//
-	// This way, every load in the vector loop will be aligned, which should be
-	// the fastest possible scenario.
-	for /**/; i >= tail; i -= 1 {
-		if data[i] == c {
-			return i
-		}
-	}
-
-	// Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level.
-	scanner: #simd[SCAN_WIDTH]u8 = c
-	alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH
-
-	i -= SCAN_WIDTH - 1
-
-	for /**/; i >= alignment_start; i -= SCAN_WIDTH {
-		load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^
-		comparison := intrinsics.simd_lanes_eq(load, scanner)
-		match := intrinsics.simd_reduce_or(comparison)
-		if match > 0 {
-			sentinel: #simd[SCAN_WIDTH]u8
-			index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel)
-			index_reduce := intrinsics.simd_reduce_max(index_select)
-			return i + cast(int)index_reduce
-		}
-	}
-
-	// Iterate as a scalar over the remaining unaligned portion.
-	i += SCAN_WIDTH - 1
-	
-	for /**/; i >= 0; i -= 1 {
-		if data[i] == c {
-			return i
-		}
-	}
-
-	return -1
-}
diff --git a/core/strings/strings.odin b/core/strings/strings.odin
index ed7f494ae..be4275e8b 100644
--- a/core/strings/strings.odin
+++ b/core/strings/strings.odin
@@ -2,8 +2,8 @@
 package strings
 
 import "base:intrinsics"
+import "core:bytes"
 import "core:io"
-@require import simd_util "core:simd/util"
 import "core:mem"
 import "core:unicode"
 import "core:unicode/utf8"
@@ -1426,23 +1426,7 @@ Output:
 
 */
 index_byte :: proc(s: string, c: byte) -> (res: int) {
-	_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
-		for i := 0; i < len(s); i += 1 {
-			if s[i] == c {
-				return i
-			}
-		}
-		return -1
-	}
-
-	// NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a
-	// significant speedup when compiling in either Size or Speed mode.
-	// The SIMD version is usually 2-3x slower without optimizations on.
-	when ODIN_OPTIMIZATION_MODE > .Minimal {
-		return #force_inline simd_util.index_byte(transmute([]u8)s, c)
-	} else {
-		return _index_byte(s, c)
-	}
+	return #force_inline bytes.index_byte(transmute([]u8)s, c)
 }
 /*
 Returns the byte offset of the last byte `c` in the string `s`, -1 when not found.
@@ -1477,20 +1461,7 @@ Output:
 
 */
 last_index_byte :: proc(s: string, c: byte) -> (res: int) {
-	_last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int {
-		for i := len(s)-1; i >= 0; i -= 1 {
-			if s[i] == c {
-				return i
-			}
-		}
-		return -1
-	}
-
-	when ODIN_OPTIMIZATION_MODE > .Minimal {
-		return #force_inline simd_util.last_index_byte(transmute([]u8)s, c)
-	} else {
-		return _last_index_byte(s, c)
-	}
+	return #force_inline bytes.last_index_byte(transmute([]u8)s, c)
 }
 /*
 Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found.
diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin
index 43ea0de98..d92a6b8c4 100644
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -115,7 +115,6 @@ import relative         "core:relative"
 import reflect          "core:reflect"
 import runtime          "base:runtime"
 import simd             "core:simd"
-import simd_util        "core:simd/util"
 import x86              "core:simd/x86"
 import slice            "core:slice"
 import slice_heap       "core:slice/heap"
@@ -238,7 +237,6 @@ _ :: relative
 _ :: reflect
 _ :: runtime
 _ :: simd
-_ :: simd_util
 _ :: x86
 _ :: slice
 _ :: slice_heap
diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin
index 357d86f67..4fdf82a49 100644
--- a/tests/benchmark/all.odin
+++ b/tests/benchmark/all.odin
@@ -1,5 +1,5 @@
 package benchmarks
 
+@(require) import "bytes"
 @(require) import "crypto"
 @(require) import "hash"
-@(require) import "simd/util"
diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/bytes/benchmark_bytes.odin
similarity index 75%
rename from tests/benchmark/simd/util/benchmark_simd_util.odin
rename to tests/benchmark/bytes/benchmark_bytes.odin
index e2187ce45..d303e81dd 100644
--- a/tests/benchmark/simd/util/benchmark_simd_util.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -1,15 +1,15 @@
-package benchmark_simd_util
+package benchmark_bytes
 
+import "core:bytes"
 import "core:fmt"
 import "core:log"
-import simd_util "core:simd/util"
 import "core:testing"
 import "core:time"
 
 
 // These are the normal, unoptimized algorithms.
 
-plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
+plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
 	for i := 0; i < len(s); i += 1 {
 		if s[i] == c {
 			return i
@@ -18,7 +18,7 @@ plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_boun
 	return -1
 }
 
-plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check {
+plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check {
 	for i := len(s)-1; i >= 0; i -= 1 {
 		if s[i] == c {
 			return i
@@ -37,7 +37,7 @@ sizes := [?]int {
 	1024 * 1024 * 1024,
 }
 
-run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
+run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) {
 	data := make([]u8, size)
 	defer delete(data)
 
@@ -95,9 +95,9 @@ benchmark_plain_index_hot :: proc(t: ^testing.T) {
 benchmark_simd_index_cold :: proc(t: ^testing.T) {
 	report: string
 	for size in sizes {
-		timing := run_trial_size(simd_util.index_byte, size, size - 1, 0, 1)
+		timing := run_trial_size(bytes.index_byte, size, size - 1, 0, 1)
 		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
-		timing = run_trial_size(simd_util.last_index_byte, size, 0, 0, 1)
+		timing = run_trial_size(bytes.last_index_byte, size, 0, 0, 1)
 		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
 	}
 	log.info(report)
@@ -107,9 +107,9 @@ benchmark_simd_index_cold :: proc(t: ^testing.T) {
 benchmark_simd_index_hot :: proc(t: ^testing.T) {
 	report: string
 	for size in sizes {
-		timing := run_trial_size(simd_util.index_byte, size, size - 1, HOT, HOT)
+		timing := run_trial_size(bytes.index_byte, size, size - 1, HOT, HOT)
 		report = fmt.tprintf("%s\n        +++ % 8M | %v", report, size, timing)
-		timing = run_trial_size(simd_util.last_index_byte, size, 0, HOT, HOT)
+		timing = run_trial_size(bytes.last_index_byte, size, 0, HOT, HOT)
 		report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing)
 	}
 	log.info(report)
diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/bytes/test_core_bytes.odin
similarity index 54%
rename from tests/core/simd/util/test_core_simd_util.odin
rename to tests/core/bytes/test_core_bytes.odin
index ba302a121..9074c0205 100644
--- a/tests/core/simd/util/test_core_simd_util.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -1,6 +1,6 @@
-package test_core_simd_util
+package test_core_bytes
 
-import simd_util "core:simd/util"
+import "core:bytes"
 import "core:testing"
 
 @test
@@ -15,30 +15,30 @@ test_index_byte_sanity :: proc(t: ^testing.T) {
 
 		// Find it at the end.
 		data[n-1] = 'o'
-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-1) {
+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-1) {
 			return
 		}
-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) {
 			return
 		}
 		data[n-1] = '-'
 
 		// Find it in the middle.
 		data[n/2] = 'o'
-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n/2) {
+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n/2) {
 			return
 		}
-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n/2) {
+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n/2) {
 			return
 		}
 		data[n/2] = '-'
 
 		// Find it at the start.
 		data[0] = 'o'
-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), 0) {
+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), 0) {
 			return
 		}
-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), 0) {
+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), 0) {
 			return
 		}
 	}
@@ -47,8 +47,8 @@ test_index_byte_sanity :: proc(t: ^testing.T) {
 @test
 test_index_byte_empty :: proc(t: ^testing.T) {
 	a: [1]u8
-	testing.expect_value(t, simd_util.index_byte(a[0:0], 'o'), -1)
-	testing.expect_value(t, simd_util.last_index_byte(a[0:0], 'o'), -1)
+	testing.expect_value(t, bytes.index_byte(a[0:0], 'o'), -1)
+	testing.expect_value(t, bytes.last_index_byte(a[0:0], 'o'), -1)
 }
 
 @test
@@ -65,12 +65,12 @@ test_index_byte_multiple_hits :: proc(t: ^testing.T) {
 		data[n-5] = 'o'
 
 		// Find the first one.
-		if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-5) {
+		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-5) {
 			return
 		}
 
 		// Find the last one.
-		if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) {
+		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) {
 			return
 		}
 	}
@@ -88,19 +88,19 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 
 		// Positive hit.
 		data[n-1] = 0
-		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), n-1) {
+		if !testing.expect_value(t, bytes.index_byte(data[:n], 0), n-1) {
 			return
 		}
-		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), n-1) {
+		if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), n-1) {
 			return
 		}
 
 		// Test for false positives.
 		data[n-1] = '-'
-		if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), -1) {
+		if !testing.expect_value(t, bytes.index_byte(data[:n], 0), -1) {
 			return
 		}
-		if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), -1) {
+		if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), -1) {
 			return
 		}
 	}
@@ -117,22 +117,22 @@ test_misaligned_data :: proc(t: ^testing.T) {
 
 		for m in 1..<n {
 			data[n-1] = 'o'
-			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), n-1-m) {
+			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), n-1-m) {
 				return
 			}
 			data[n-1] = '-'
 
 			data[m+(n-m)/2] = 'o'
-			if !testing.expect_value(t, simd_util.index_byte(data[m:n], 'o'), (n-m)/2) {
+			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), (n-m)/2) {
 				return
 			}
-			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), (n-m)/2) {
+			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), (n-m)/2) {
 				return
 			}
 			data[m+(n-m)/2] = '-'
 
 			data[m]   = 'o'
-			if !testing.expect_value(t, simd_util.last_index_byte(data[m:n], 'o'), 0) {
+			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), 0) {
 				return
 			}
 			data[m]   = '-'
diff --git a/tests/core/normal.odin b/tests/core/normal.odin
index a1b948fea..e35d86598 100644
--- a/tests/core/normal.odin
+++ b/tests/core/normal.odin
@@ -9,6 +9,7 @@ download_assets :: proc() {
 	}
 }
 
+@(require) import "bytes"
 @(require) import "c/libc"
 @(require) import "compress"
 @(require) import "container"
@@ -34,7 +35,6 @@ download_assets :: proc() {
 @(require) import "path/filepath"
 @(require) import "reflect"
 @(require) import "runtime"
-@(require) import "simd/util"
 @(require) import "slice"
 @(require) import "strconv"
 @(require) import "strings"

From 5d5addd48fef09132c08e4b570675491dd5cdb76 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sat, 10 Aug 2024 07:18:49 -0400
Subject: [PATCH 11/13] Set `SIMD_SCAN_WIDTH` based on `size_of(uintptr)`

---
 core/bytes/bytes.odin | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index e09859a19..8e7bc01bd 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -6,14 +6,30 @@ import "core:unicode"
 import "core:unicode/utf8"
 
 
-@private SIMD_SCAN_WIDTH :: 32
+@private SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
 
-@(private, rodata)
-simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
-	 0,  1,  2,  3,  4,  5,  6,  7,
-	 8,  9, 10, 11, 12, 13, 14, 15,
-	16, 17, 18, 19, 20, 21, 22, 23,
-	24, 25, 26, 27, 28, 29, 30, 31,
+when SIMD_SCAN_WIDTH == 32 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		 0,  1,  2,  3,  4,  5,  6,  7,
+		 8,  9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+	}
+} else when SIMD_SCAN_WIDTH == 64 {
+	@(private, rodata)
+	simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 {
+		 0,  1,  2,  3,  4,  5,  6,  7,
+		 8,  9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+		32, 33, 34, 35, 36, 37, 38, 39,
+		40, 41, 42, 43, 44, 45, 46, 47,
+		48, 49, 50, 51, 52, 53, 54, 55,
+		56, 57, 58, 59, 60, 61, 62, 63,
+	}
+} else {
+	#panic("Invalid SIMD_SCAN_WIDTH. Must be 32 or 64.")
 }
 
 

From 9d2b4b2f03a30e296e55f0f5f9ce33e20303f55b Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sat, 10 Aug 2024 08:13:22 -0400
Subject: [PATCH 12/13] Simplify `core:bytes` test

---
 tests/core/bytes/test_core_bytes.odin | 90 +++++----------------------
 1 file changed, 17 insertions(+), 73 deletions(-)

diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin
index 9074c0205..72390291f 100644
--- a/tests/core/bytes/test_core_bytes.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -1,45 +1,27 @@
 package test_core_bytes
 
 import "core:bytes"
+import "core:slice"
 import "core:testing"
 
 @test
 test_index_byte_sanity :: proc(t: ^testing.T) {
 	// We must be able to find the byte at the correct index.
-	for n in 1..<256 {
-		data := make([]u8, n)
-		defer delete(data)
-		for i in 0..<n-1 {
-			data[i] = '-'
-		}
+	data := make([]u8, 64)
+	defer delete(data)
+	slice.fill(data, '-')
 
-		// Find it at the end.
-		data[n-1] = 'o'
-		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-1) {
-			return
-		}
-		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) {
-			return
-		}
-		data[n-1] = '-'
-
-		// Find it in the middle.
-		data[n/2] = 'o'
-		if !testing.expect_value(t, bytes.index_byte(data, 'o'), n/2) {
-			return
-		}
-		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n/2) {
-			return
-		}
-		data[n/2] = '-'
-
-		// Find it at the start.
-		data[0] = 'o'
-		if !testing.expect_value(t, bytes.index_byte(data, 'o'), 0) {
-			return
-		}
-		if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), 0) {
-			return
+	for offset in 0..<31 {
+		for idx in 0..<31 {
+			sub := data[offset:]
+			sub[idx] = 'o'
+			if !testing.expect_value(t, bytes.index_byte(sub, 'o'), idx) {
+				return
+			}
+			if !testing.expect_value(t, bytes.last_index_byte(sub, 'o'), idx) {
+				return
+			}
+			sub[idx] = '-'
 		}
 	}
 }
@@ -56,9 +38,7 @@ test_index_byte_multiple_hits :: proc(t: ^testing.T) {
 	for n in 5..<256 {
 		data := make([]u8, n)
 		defer delete(data)
-		for i in 0..<n-1 {
-			data[i] = '-'
-		}
+		slice.fill(data, '-')
 
 		data[n-1] = 'o'
 		data[n-3] = 'o'
@@ -82,9 +62,7 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 	for n in 1..<256 {
 		data := make([]u8, n + 64)
 		defer delete(data)
-		for i in 0..<n-1 {
-			data[i] = '-'
-		}
+		slice.fill(data, '-')
 
 		// Positive hit.
 		data[n-1] = 0
@@ -105,37 +83,3 @@ test_index_byte_zero :: proc(t: ^testing.T) {
 		}
 	}
 }
-
-@test
-test_misaligned_data :: proc(t: ^testing.T) {
-	for n in 2..<256 {
-		data := make([]u8, n)
-		defer delete(data)
-		for i in 0..<n-1 {
-			data[i] = '-'
-		}
-
-		for m in 1..<n {
-			data[n-1] = 'o'
-			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), n-1-m) {
-				return
-			}
-			data[n-1] = '-'
-
-			data[m+(n-m)/2] = 'o'
-			if !testing.expect_value(t, bytes.index_byte(data[m:n], 'o'), (n-m)/2) {
-				return
-			}
-			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), (n-m)/2) {
-				return
-			}
-			data[m+(n-m)/2] = '-'
-
-			data[m]   = 'o'
-			if !testing.expect_value(t, bytes.last_index_byte(data[m:n], 'o'), 0) {
-				return
-			}
-			data[m]   = '-'
-		}
-	}
-}

From 4f816aabb3b008aae201cd092610dfda7f6715c8 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Sat, 10 Aug 2024 13:51:18 -0400
Subject: [PATCH 13/13] Use `SIMD_SCAN_WIDTH` constant in `core:bytes` test

---
 tests/core/bytes/test_core_bytes.odin | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin
index 72390291f..fb3c460aa 100644
--- a/tests/core/bytes/test_core_bytes.odin
+++ b/tests/core/bytes/test_core_bytes.odin
@@ -4,15 +4,19 @@ import "core:bytes"
 import "core:slice"
 import "core:testing"
 
+@private SIMD_SCAN_WIDTH :: 8 * size_of(uintptr)
+
 @test
 test_index_byte_sanity :: proc(t: ^testing.T) {
 	// We must be able to find the byte at the correct index.
-	data := make([]u8, 64)
+	data := make([]u8, 2 * SIMD_SCAN_WIDTH)
 	defer delete(data)
 	slice.fill(data, '-')
 
-	for offset in 0..<31 {
-		for idx in 0..<31 {
+	INDEX_MAX :: SIMD_SCAN_WIDTH - 1
+
+	for offset in 0..<INDEX_MAX {
+		for idx in 0..<INDEX_MAX {
 			sub := data[offset:]
 			sub[idx] = 'o'
 			if !testing.expect_value(t, bytes.index_byte(sub, 'o'), idx) {