From 8deeb40e5de1d33b571f0b1faf7b8dea678cd91b Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:47:00 -0400 Subject: [PATCH 01/13] Add vectorized `index_byte` and `last_index_byte` --- core/simd/util/util.odin | 188 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 core/simd/util/util.odin diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin new file mode 100644 index 000000000..ac523b42a --- /dev/null +++ b/core/simd/util/util.odin @@ -0,0 +1,188 @@ +/* + (c) Copyright 2024 Feoramund . + Made available under Odin's BSD-3 license. + + List of contributors: + Feoramund: `index_byte` procedures. +*/ + +// package simd_util implements compositions of SIMD operations for optimizing +// the core library where available. + +//+build i386, amd64 +package simd_util + +import "base:intrinsics" +import "core:simd/x86" + +@private SCAN_REGISTER_SIZE :: 16 +@private SCAN_REGISTERS :: 4 +@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE + +// How long should a string be before using any of the `index_*` procedures in +// this package. +RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE + +/* +Scan a slice of bytes for a specific byte. + +This procedure safely handles padding out slices of any length, including empty +slices. + +Inputs: +- data: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. 
+*/ +@(enable_target_feature="sse2") +index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check { + scanner_data: [SCAN_REGISTER_SIZE]u8 = c + scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0]) + + i: int + length := len(data) + full_chunks_length := length - length % SCAN_WIDTH + + for /**/; i < full_chunks_length; i += SCAN_WIDTH { + simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i]) + + #unroll for j in 0..> 63) = 0x0000_0000_0000_0001 + // + // The multiplication is a guard against zero. + // + submask = ~submask * (submask >> 63) + // + // Finally, mask out any irrelevant bits with the submask. + mask &= i32(submask) + + if mask != 0 { + ctz := int(intrinsics.count_trailing_zeros(mask)) + return i + j * SCAN_REGISTER_SIZE + ctz + } + } + } + + return -1 +} + +/* +Scan a slice of bytes for a specific byte, starting from the end and working +backwards to the start. + +This procedure safely handles padding out slices of any length, including empty +slices. + +Inputs: +- data: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. +*/ +@(enable_target_feature="sse2") +last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check { + scanner_data: [SCAN_REGISTER_SIZE]u8 = c + scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0]) + + i := len(data) - SCAN_WIDTH + + for /**/; i >= 0; i -= SCAN_WIDTH { + simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i]) + + // There is no #reverse #unroll at the time of this writing, so we use + // `j` to count down by subtraction. + #unroll for j in 1..=SCAN_REGISTERS { + cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner) + mask := x86._mm_movemask_epi8(cmp) + + if mask != 0 { + // CLZ is used instead to get the on-bit from the other end. 
+ clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask)) + return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz + } + } + } + + if i < 0 { + padded_data_end: [SCAN_WIDTH]u8 = --- + remnant_length := len(data) % SCAN_WIDTH + intrinsics.mem_copy_non_overlapping( + &padded_data_end[0], + &raw_data(data)[0], + remnant_length, + ) + + simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0]) + + #unroll for j in 1..=SCAN_REGISTERS { + cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner) + mask := x86._mm_movemask_epi8(cmp) + + submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE) + submask = ~submask * (submask >> 63) + + mask &= i32(submask) + + if mask != 0 { + clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask)) + return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz + } + } + } + + return -1 +} From f66fcd9acb390b199452a125ed09899dffefde5d Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:58:56 -0400 Subject: [PATCH 02/13] Use vectorized `index_*` procs in `core` --- core/bytes/bytes.odin | 47 ++++++++++++++++++++++++++++++++------- core/strings/strings.odin | 47 ++++++++++++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 7cbf092ac..dcd4931e2 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -1,6 +1,8 @@ package bytes +import "base:intrinsics" import "core:mem" +@require import simd_util "core:simd/util" import "core:unicode" import "core:unicode/utf8" @@ -295,22 +297,51 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { index_byte :: proc(s: []byte, c: byte) -> int { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i + _index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + for i := 0; i < len(s); i += 1 { + if s[i] == c { + return i 
+ } } + return -1 + } + + // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a + // significant speedup when compiling in either Size or Speed mode. + // The SIMD version is usually 2-3x slower without optimizations on. + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + // SIMD's benefits are noticeable only past a certain threshold of data. + // For small data, use the plain old algorithm. + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.index_byte(s, c) + } else { + return _index_byte(s, c) + } + } else { + return _index_byte(s, c) } - return -1 } // Returns -1 if c is not present last_index_byte :: proc(s: []byte, c: byte) -> int { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i + _last_index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + for i := len(s)-1; i >= 0; i -= 1 { + if s[i] == c { + return i + } } + return -1 + } + + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.last_index_byte(s, c) + } else { + return _last_index_byte(s, c) + } + } else { + return _last_index_byte(s, c) } - return -1 } diff --git a/core/strings/strings.odin b/core/strings/strings.odin index e9b50bab0..9d3e88165 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1,7 +1,9 @@ // Procedures to manipulate UTF-8 encoded strings package strings +import "base:intrinsics" import "core:io" +@require import simd_util "core:simd/util" import "core:mem" import "core:unicode" import "core:unicode/utf8" @@ -1424,12 +1426,29 @@ Output: */ index_byte :: proc(s: string, c: byte) -> (res: int) { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i + _index_byte :: #force_inline proc(s: string, c: byte) -> int { + for i := 0; i < len(s); i += 1 { + if s[i] == c { + return i + } } + return -1 + } + + // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a + // 
significant speedup when compiling in either Size or Speed mode. + // The SIMD version is usually 2-3x slower without optimizations on. + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + // SIMD's benefits are noticeable only past a certain threshold of data. + // For small data, use the plain old algorithm. + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.index_byte(transmute([]u8)s, c) + } else { + return _index_byte(s, c) + } + } else { + return _index_byte(s, c) } - return -1 } /* Returns the byte offset of the last byte `c` in the string `s`, -1 when not found. @@ -1464,12 +1483,24 @@ Output: */ last_index_byte :: proc(s: string, c: byte) -> (res: int) { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i + _last_index_byte :: #force_inline proc(s: string, c: byte) -> int { + for i := len(s)-1; i >= 0; i -= 1 { + if s[i] == c { + return i + } } + return -1 + } + + when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { + if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { + return simd_util.last_index_byte(transmute([]u8)s, c) + } else { + return _last_index_byte(s, c) + } + } else { + return _last_index_byte(s, c) } - return -1 } /* Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found. 
From 28c98c2e7af8b1ce49ead18398fc7793b00e0df3 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sun, 4 Aug 2024 16:07:19 -0400 Subject: [PATCH 03/13] Add tests for vectorized `index_*` procs --- tests/core/normal.odin | 1 + tests/core/simd/util/test_core_simd_util.odin | 108 ++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 tests/core/simd/util/test_core_simd_util.odin diff --git a/tests/core/normal.odin b/tests/core/normal.odin index 8cd3b3917..a1b948fea 100644 --- a/tests/core/normal.odin +++ b/tests/core/normal.odin @@ -34,6 +34,7 @@ download_assets :: proc() { @(require) import "path/filepath" @(require) import "reflect" @(require) import "runtime" +@(require) import "simd/util" @(require) import "slice" @(require) import "strconv" @(require) import "strings" diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin new file mode 100644 index 000000000..65bf566c0 --- /dev/null +++ b/tests/core/simd/util/test_core_simd_util.odin @@ -0,0 +1,108 @@ +//+build i386, amd64 +package test_core_simd_util + +import simd_util "core:simd/util" +import "core:testing" + +@test +test_index_byte_sanity :: proc(t: ^testing.T) { + // We must be able to find the byte at the correct index. + for n in 1..<256 { + data := make([]u8, n) + defer delete(data) + for i in 0.. 
Date: Sun, 4 Aug 2024 16:13:00 -0400 Subject: [PATCH 04/13] Add benchmarks for vectorized `index_*` procs --- tests/benchmark/all.odin | 1 + .../simd/util/benchmark_simd_util.odin | 117 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tests/benchmark/simd/util/benchmark_simd_util.odin diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin index d1b7662e2..357d86f67 100644 --- a/tests/benchmark/all.odin +++ b/tests/benchmark/all.odin @@ -2,3 +2,4 @@ package benchmarks @(require) import "crypto" @(require) import "hash" +@(require) import "simd/util" diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin new file mode 100644 index 000000000..4538c6612 --- /dev/null +++ b/tests/benchmark/simd/util/benchmark_simd_util.odin @@ -0,0 +1,117 @@ +//+build i386, amd64 +package benchmark_simd_util + +import "core:fmt" +import "core:log" +import simd_util "core:simd/util" +import "core:testing" +import "core:time" + + +// These are the normal, unoptimized algorithms. + +plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { + for i := 0; i < len(s); i += 1 { + if s[i] == c { + return i + } + } + return -1 +} + +plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { + for i := len(s)-1; i >= 0; i -= 1 { + if s[i] == c { + return i + } + } + return -1 +} + +sizes := [?]int { + 15, 16, 17, + 31, 32, 33, + 256, + 512, + 1024, + 1024 * 1024, + 1024 * 1024 * 1024, +} + +run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { + data := make([]u8, size) + defer delete(data) + + for i in 0.. 
Date: Sun, 4 Aug 2024 16:14:17 -0400 Subject: [PATCH 05/13] Add `simd_util` to `examples/all` --- examples/all/all_main.odin | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index d92a6b8c4..43ea0de98 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -115,6 +115,7 @@ import relative "core:relative" import reflect "core:reflect" import runtime "base:runtime" import simd "core:simd" +import simd_util "core:simd/util" import x86 "core:simd/x86" import slice "core:slice" import slice_heap "core:slice/heap" @@ -237,6 +238,7 @@ _ :: relative _ :: reflect _ :: runtime _ :: simd +_ :: simd_util _ :: x86 _ :: slice _ :: slice_heap From 12dd0cb72a586a99129280c78697089caab0500a Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:39:19 -0400 Subject: [PATCH 06/13] Simplify and make `simd_util` cross-platform This new algorithm uses a Scalar->Vector->Scalar iteration loop which requires no masking off of any incomplete data chunks. Also, the width was reduced to 32 bytes instead of 64, as I found this to be about as fast as the previous 64-byte x86 version. --- core/bytes/bytes.odin | 18 +- core/simd/util/util.odin | 214 ++++++++---------- core/strings/strings.odin | 18 +- .../simd/util/benchmark_simd_util.odin | 1 - tests/core/simd/util/test_core_simd_util.odin | 1 - 5 files changed, 101 insertions(+), 151 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index dcd4931e2..136c98f6b 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -309,14 +309,8 @@ index_byte :: proc(s: []byte, c: byte) -> int { // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a // significant speedup when compiling in either Size or Speed mode. // The SIMD version is usually 2-3x slower without optimizations on. 
- when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { - // SIMD's benefits are noticeable only past a certain threshold of data. - // For small data, use the plain old algorithm. - if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { - return simd_util.index_byte(s, c) - } else { - return _index_byte(s, c) - } + when ODIN_OPTIMIZATION_MODE > .Minimal { + return #force_inline simd_util.index_byte(s, c) } else { return _index_byte(s, c) } @@ -333,12 +327,8 @@ last_index_byte :: proc(s: []byte, c: byte) -> int { return -1 } - when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { - if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { - return simd_util.last_index_byte(s, c) - } else { - return _last_index_byte(s, c) - } + when ODIN_OPTIMIZATION_MODE > .Minimal { + return #force_inline simd_util.last_index_byte(s, c) } else { return _last_index_byte(s, c) } diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin index ac523b42a..b209a44ea 100644 --- a/core/simd/util/util.odin +++ b/core/simd/util/util.odin @@ -8,26 +8,24 @@ // package simd_util implements compositions of SIMD operations for optimizing // the core library where available. - -//+build i386, amd64 package simd_util import "base:intrinsics" -import "core:simd/x86" -@private SCAN_REGISTER_SIZE :: 16 -@private SCAN_REGISTERS :: 4 -@private SCAN_WIDTH :: SCAN_REGISTERS * SCAN_REGISTER_SIZE +@private SCAN_WIDTH :: 32 -// How long should a string be before using any of the `index_*` procedures in -// this package. -RECOMMENDED_SCAN_SIZE :: SCAN_REGISTER_SIZE +@(private, rodata) +simd_scanner_indices := #simd[SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +} /* Scan a slice of bytes for a specific byte. -This procedure safely handles padding out slices of any length, including empty -slices. 
+This procedure safely handles slices of any length, including empty slices. Inputs: - data: A slice of bytes. @@ -36,83 +34,54 @@ Inputs: Returns: - index: The index of the byte `c`, or -1 if it was not found. */ -@(enable_target_feature="sse2") index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check { - scanner_data: [SCAN_REGISTER_SIZE]u8 = c - scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0]) - - i: int length := len(data) - full_chunks_length := length - length % SCAN_WIDTH + i := 0 - for /**/; i < full_chunks_length; i += SCAN_WIDTH { - simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i]) - - #unroll for j in 0..> 63) = 0x0000_0000_0000_0001 - // - // The multiplication is a guard against zero. - // - submask = ~submask * (submask >> 63) - // - // Finally, mask out any irrelevant bits with the submask. - mask &= i32(submask) - - if mask != 0 { - ctz := int(intrinsics.count_trailing_zeros(mask)) - return i + j * SCAN_REGISTER_SIZE + ctz - } + for /**/; i < tail; i += SCAN_WIDTH { + load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF) + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_min(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + for /**/; i < length; i += 1 { + if data[i] == c { + return i } } @@ -123,8 +92,7 @@ index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check { Scan a slice of bytes for a specific byte, starting from the end and working backwards to the start. -This procedure safely handles padding out slices of any length, including empty -slices. +This procedure safely handles slices of any length, including empty slices. 
Inputs: - data: A slice of bytes. @@ -133,54 +101,58 @@ Inputs: Returns: - index: The index of the byte `c`, or -1 if it was not found. */ -@(enable_target_feature="sse2") last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check { - scanner_data: [SCAN_REGISTER_SIZE]u8 = c - scanner := intrinsics.unaligned_load(cast(^x86.__m128i)&scanner_data[0]) + length := len(data) + i := length - 1 - i := len(data) - SCAN_WIDTH - - for /**/; i >= 0; i -= SCAN_WIDTH { - simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&data[i]) - - // There is no #reverse #unroll at the time of this writing, so we use - // `j` to count down by subtraction. - #unroll for j in 1..=SCAN_REGISTERS { - cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner) - mask := x86._mm_movemask_epi8(cmp) - - if mask != 0 { - // CLZ is used instead to get the on-bit from the other end. - clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask)) - return i + SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz + // Guard against small strings. + if length < SCAN_WIDTH { + for /**/; i >= 0; i -= 1 { + if data[i] == c { + return i } } + return -1 + } + + ptr := cast(int)cast(uintptr)raw_data(data) + + tail := length - (ptr + length) % SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i >= tail; i -= 1 { + if data[i] == c { + return i + } } - if i < 0 { - padded_data_end: [SCAN_WIDTH]u8 = --- - remnant_length := len(data) % SCAN_WIDTH - intrinsics.mem_copy_non_overlapping( - &padded_data_end[0], - &raw_data(data)[0], - remnant_length, - ) + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. 
+ scanner: #simd[SCAN_WIDTH]u8 = c + alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH - simd_load := intrinsics.unaligned_load(cast(^[SCAN_REGISTERS]x86.__m128i)&padded_data_end[0]) + i -= SCAN_WIDTH - 1 - #unroll for j in 1..=SCAN_REGISTERS { - cmp := x86._mm_cmpeq_epi8(simd_load[SCAN_REGISTERS-j], scanner) - mask := x86._mm_movemask_epi8(cmp) + for /**/; i >= alignment_start; i -= SCAN_WIDTH { + load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SCAN_WIDTH]u8 + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_max(index_select) + return i + cast(int)index_reduce + } + } - submask := max(u64) << u64(remnant_length - (SCAN_REGISTERS-j) * SCAN_REGISTER_SIZE) - submask = ~submask * (submask >> 63) - - mask &= i32(submask) - - if mask != 0 { - clz := (8 * size_of(mask) - 1) - int(intrinsics.count_leading_zeros(mask)) - return SCAN_WIDTH - j * SCAN_REGISTER_SIZE + clz - } + // Iterate as a scalar over the remaining unaligned portion. + i += SCAN_WIDTH - 1 + + for /**/; i >= 0; i -= 1 { + if data[i] == c { + return i } } diff --git a/core/strings/strings.odin b/core/strings/strings.odin index 9d3e88165..b8e43f90d 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1438,14 +1438,8 @@ index_byte :: proc(s: string, c: byte) -> (res: int) { // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a // significant speedup when compiling in either Size or Speed mode. // The SIMD version is usually 2-3x slower without optimizations on. - when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { - // SIMD's benefits are noticeable only past a certain threshold of data. - // For small data, use the plain old algorithm. 
- if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { - return simd_util.index_byte(transmute([]u8)s, c) - } else { - return _index_byte(s, c) - } + when ODIN_OPTIMIZATION_MODE > .Minimal { + return #force_inline simd_util.index_byte(transmute([]u8)s, c) } else { return _index_byte(s, c) } @@ -1492,12 +1486,8 @@ last_index_byte :: proc(s: string, c: byte) -> (res: int) { return -1 } - when ODIN_OPTIMIZATION_MODE > .Minimal && intrinsics.has_target_feature("sse2") { - if len(s) >= simd_util.RECOMMENDED_SCAN_SIZE { - return simd_util.last_index_byte(transmute([]u8)s, c) - } else { - return _last_index_byte(s, c) - } + when ODIN_OPTIMIZATION_MODE > .Minimal { + return #force_inline simd_util.last_index_byte(transmute([]u8)s, c) } else { return _last_index_byte(s, c) } diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin index 4538c6612..18fa0a9e3 100644 --- a/tests/benchmark/simd/util/benchmark_simd_util.odin +++ b/tests/benchmark/simd/util/benchmark_simd_util.odin @@ -1,4 +1,3 @@ -//+build i386, amd64 package benchmark_simd_util import "core:fmt" diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin index 65bf566c0..ff7e1f9aa 100644 --- a/tests/core/simd/util/test_core_simd_util.odin +++ b/tests/core/simd/util/test_core_simd_util.odin @@ -1,4 +1,3 @@ -//+build i386, amd64 package test_core_simd_util import simd_util "core:simd/util" From c8a62ee4ec9b7beec6dcff907ad2dfecdd547f22 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:42:14 -0400 Subject: [PATCH 07/13] Make `simd_util` index procs `contextless` where applicable --- core/bytes/bytes.odin | 4 ++-- core/simd/util/util.odin | 4 ++-- core/strings/strings.odin | 4 ++-- tests/benchmark/simd/util/benchmark_simd_util.odin | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 
136c98f6b..4edd089b9 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -297,7 +297,7 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { index_byte :: proc(s: []byte, c: byte) -> int { - _index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + _index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { for i := 0; i < len(s); i += 1 { if s[i] == c { return i @@ -318,7 +318,7 @@ index_byte :: proc(s: []byte, c: byte) -> int { // Returns -1 if c is not present last_index_byte :: proc(s: []byte, c: byte) -> int { - _last_index_byte :: #force_inline proc(s: []byte, c: byte) -> int { + _last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { for i := len(s)-1; i >= 0; i -= 1 { if s[i] == c { return i diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin index b209a44ea..74401689a 100644 --- a/core/simd/util/util.odin +++ b/core/simd/util/util.odin @@ -34,7 +34,7 @@ Inputs: Returns: - index: The index of the byte `c`, or -1 if it was not found. */ -index_byte :: proc(data: []u8, c: byte) -> (index: int) #no_bounds_check { +index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check { length := len(data) i := 0 @@ -101,7 +101,7 @@ Inputs: Returns: - index: The index of the byte `c`, or -1 if it was not found. 
*/ -last_index_byte :: proc(data: []u8, c: byte) -> int #no_bounds_check { +last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check { length := len(data) i := length - 1 diff --git a/core/strings/strings.odin b/core/strings/strings.odin index b8e43f90d..ed7f494ae 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -1426,7 +1426,7 @@ Output: */ index_byte :: proc(s: string, c: byte) -> (res: int) { - _index_byte :: #force_inline proc(s: string, c: byte) -> int { + _index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { for i := 0; i < len(s); i += 1 { if s[i] == c { return i @@ -1477,7 +1477,7 @@ Output: */ last_index_byte :: proc(s: string, c: byte) -> (res: int) { - _last_index_byte :: #force_inline proc(s: string, c: byte) -> int { + _last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { for i := len(s)-1; i >= 0; i -= 1 { if s[i] == c { return i diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/simd/util/benchmark_simd_util.odin index 18fa0a9e3..e2187ce45 100644 --- a/tests/benchmark/simd/util/benchmark_simd_util.odin +++ b/tests/benchmark/simd/util/benchmark_simd_util.odin @@ -9,7 +9,7 @@ import "core:time" // These are the normal, unoptimized algorithms. 
-plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := 0; i < len(s); i += 1 { if s[i] == c { return i @@ -18,7 +18,7 @@ plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { return -1 } -plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := len(s)-1; i >= 0; i -= 1 { if s[i] == c { return i @@ -37,7 +37,7 @@ sizes := [?]int { 1024 * 1024 * 1024, } -run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { +run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { data := make([]u8, size) defer delete(data) From 0d29cc3375d4f20e122df23726cc526f96bf3305 Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:46:47 -0400 Subject: [PATCH 08/13] Use `for x in y` construct for `bytes` iteration This cannot be applied to the `strings` version, as that would cause a rune-by-rune iteration, not a byte-by-byte one. 
--- core/bytes/bytes.odin | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index 4edd089b9..e130502a1 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -298,8 +298,8 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { index_byte :: proc(s: []byte, c: byte) -> int { _index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - for i := 0; i < len(s); i += 1 { - if s[i] == c { + for ch, i in s { + if ch == c { return i } } @@ -319,8 +319,8 @@ index_byte :: proc(s: []byte, c: byte) -> int { // Returns -1 if c is not present last_index_byte :: proc(s: []byte, c: byte) -> int { _last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { + #reverse for ch, i in s { + if ch == c { return i } } From e7e7fe766a2e9171b7edff58cfdf889a41e1094e Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Fri, 9 Aug 2024 17:47:27 -0400 Subject: [PATCH 09/13] Add test for misaligned data to `core:simd/util` suite --- tests/core/simd/util/test_core_simd_util.odin | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/simd/util/test_core_simd_util.odin index ff7e1f9aa..ba302a121 100644 --- a/tests/core/simd/util/test_core_simd_util.odin +++ b/tests/core/simd/util/test_core_simd_util.odin @@ -105,3 +105,37 @@ test_index_byte_zero :: proc(t: ^testing.T) { } } } + +@test +test_misaligned_data :: proc(t: ^testing.T) { + for n in 2..<256 { + data := make([]u8, n) + defer delete(data) + for i in 0.. 
Date: Sat, 10 Aug 2024 07:17:03 -0400 Subject: [PATCH 10/13] Merge `core:simd/util` into `core:bytes` --- core/bytes/bytes.odin | 151 ++++++++++++++--- core/simd/util/util.odin | 160 ------------------ core/strings/strings.odin | 35 +--- examples/all/all_main.odin | 2 - tests/benchmark/all.odin | 2 +- .../benchmark_bytes.odin} | 18 +- .../test_core_bytes.odin} | 40 ++--- tests/core/normal.odin | 2 +- 8 files changed, 164 insertions(+), 246 deletions(-) delete mode 100644 core/simd/util/util.odin rename tests/benchmark/{simd/util/benchmark_simd_util.odin => bytes/benchmark_bytes.odin} (75%) rename tests/core/{simd/util/test_core_simd_util.odin => bytes/test_core_bytes.odin} (54%) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index e130502a1..e09859a19 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -2,10 +2,21 @@ package bytes import "base:intrinsics" import "core:mem" -@require import simd_util "core:simd/util" import "core:unicode" import "core:unicode/utf8" + +@private SIMD_SCAN_WIDTH :: 32 + +@(private, rodata) +simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +} + + clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte { c := make([]byte, len(s), allocator, loc) copy(c, s) @@ -295,43 +306,141 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { return _split_iterator(s, sep, len(sep)) } +/* +Scan a slice of bytes for a specific byte. -index_byte :: proc(s: []byte, c: byte) -> int { - _index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - for ch, i in s { - if ch == c { +This procedure safely handles slices of any length, including empty slices. + +Inputs: +- data: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. 
+*/ +index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check { + length := len(s) + i := 0 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i < length; i += 1 { + if s[i] == c { return i } } return -1 } - // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a - // significant speedup when compiling in either Size or Speed mode. - // The SIMD version is usually 2-3x slower without optimizations on. - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.index_byte(s, c) - } else { - return _index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i < alignment_start; i += 1 { + if s[i] == c { + return i + } } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. + scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH + + for /**/; i < tail; i += SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF) + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_min(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. 
+ for /**/; i < length; i += 1 { + if s[i] == c { + return i + } + } + + return -1 } -// Returns -1 if c is not present -last_index_byte :: proc(s: []byte, c: byte) -> int { - _last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - #reverse for ch, i in s { - if ch == c { +/* +Scan a slice of bytes for a specific byte, starting from the end and working +backwards to the start. + +This procedure safely handles slices of any length, including empty slices. + +Inputs: +- s: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. +*/ +last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check { + length := len(s) + i := length - 1 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i >= 0; i -= 1 { + if s[i] == c { return i } } return -1 } - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.last_index_byte(s, c) - } else { - return _last_index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + tail := length - (ptr + length) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i >= tail; i -= 1 { + if s[i] == c { + return i + } } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. 
+ scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + i -= SIMD_SCAN_WIDTH - 1 + + for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_max(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + i += SIMD_SCAN_WIDTH - 1 + + for /**/; i >= 0; i -= 1 { + if s[i] == c { + return i + } + } + + return -1 } diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin deleted file mode 100644 index 74401689a..000000000 --- a/core/simd/util/util.odin +++ /dev/null @@ -1,160 +0,0 @@ -/* - (c) Copyright 2024 Feoramund . - Made available under Odin's BSD-3 license. - - List of contributors: - Feoramund: `index_byte` procedures. -*/ - -// package simd_util implements compositions of SIMD operations for optimizing -// the core library where available. -package simd_util - -import "base:intrinsics" - -@private SCAN_WIDTH :: 32 - -@(private, rodata) -simd_scanner_indices := #simd[SCAN_WIDTH]u8 { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, -} - -/* -Scan a slice of bytes for a specific byte. - -This procedure safely handles slices of any length, including empty slices. - -Inputs: -- data: A slice of bytes. -- c: The byte to search for. - -Returns: -- index: The index of the byte `c`, or -1 if it was not found. -*/ -index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check { - length := len(data) - i := 0 - - // Guard against small strings. 
- if length < SCAN_WIDTH { - for /**/; i < length; i += 1 { - if data[i] == c { - return i - } - } - return -1 - } - - ptr := cast(int)cast(uintptr)raw_data(data) - - alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH - - // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary. - // - // This way, every load in the vector loop will be aligned, which should be - // the fastest possible scenario. - for /**/; i < alignment_start; i += 1 { - if data[i] == c { - return i - } - } - - // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. - scanner: #simd[SCAN_WIDTH]u8 = c - tail := length - (length - alignment_start) % SCAN_WIDTH - - for /**/; i < tail; i += SCAN_WIDTH { - load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ - comparison := intrinsics.simd_lanes_eq(load, scanner) - match := intrinsics.simd_reduce_or(comparison) - if match > 0 { - sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF) - index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) - index_reduce := intrinsics.simd_reduce_min(index_select) - return i + cast(int)index_reduce - } - } - - // Iterate as a scalar over the remaining unaligned portion. - for /**/; i < length; i += 1 { - if data[i] == c { - return i - } - } - - return -1 -} - -/* -Scan a slice of bytes for a specific byte, starting from the end and working -backwards to the start. - -This procedure safely handles slices of any length, including empty slices. - -Inputs: -- data: A slice of bytes. -- c: The byte to search for. - -Returns: -- index: The index of the byte `c`, or -1 if it was not found. -*/ -last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check { - length := len(data) - i := length - 1 - - // Guard against small strings. 
- if length < SCAN_WIDTH { - for /**/; i >= 0; i -= 1 { - if data[i] == c { - return i - } - } - return -1 - } - - ptr := cast(int)cast(uintptr)raw_data(data) - - tail := length - (ptr + length) % SCAN_WIDTH - - // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary. - // - // This way, every load in the vector loop will be aligned, which should be - // the fastest possible scenario. - for /**/; i >= tail; i -= 1 { - if data[i] == c { - return i - } - } - - // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. - scanner: #simd[SCAN_WIDTH]u8 = c - alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH - - i -= SCAN_WIDTH - 1 - - for /**/; i >= alignment_start; i -= SCAN_WIDTH { - load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ - comparison := intrinsics.simd_lanes_eq(load, scanner) - match := intrinsics.simd_reduce_or(comparison) - if match > 0 { - sentinel: #simd[SCAN_WIDTH]u8 - index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) - index_reduce := intrinsics.simd_reduce_max(index_select) - return i + cast(int)index_reduce - } - } - - // Iterate as a scalar over the remaining unaligned portion. 
- i += SCAN_WIDTH - 1 - - for /**/; i >= 0; i -= 1 { - if data[i] == c { - return i - } - } - - return -1 -} diff --git a/core/strings/strings.odin b/core/strings/strings.odin index ed7f494ae..be4275e8b 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -2,8 +2,8 @@ package strings import "base:intrinsics" +import "core:bytes" import "core:io" -@require import simd_util "core:simd/util" import "core:mem" import "core:unicode" import "core:unicode/utf8" @@ -1426,23 +1426,7 @@ Output: */ index_byte :: proc(s: string, c: byte) -> (res: int) { - _index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i - } - } - return -1 - } - - // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a - // significant speedup when compiling in either Size or Speed mode. - // The SIMD version is usually 2-3x slower without optimizations on. - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.index_byte(transmute([]u8)s, c) - } else { - return _index_byte(s, c) - } + return #force_inline bytes.index_byte(transmute([]u8)s, c) } /* Returns the byte offset of the last byte `c` in the string `s`, -1 when not found. @@ -1477,20 +1461,7 @@ Output: */ last_index_byte :: proc(s: string, c: byte) -> (res: int) { - _last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i - } - } - return -1 - } - - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.last_index_byte(transmute([]u8)s, c) - } else { - return _last_index_byte(s, c) - } + return #force_inline bytes.last_index_byte(transmute([]u8)s, c) } /* Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found. 
diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index 43ea0de98..d92a6b8c4 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -115,7 +115,6 @@ import relative "core:relative" import reflect "core:reflect" import runtime "base:runtime" import simd "core:simd" -import simd_util "core:simd/util" import x86 "core:simd/x86" import slice "core:slice" import slice_heap "core:slice/heap" @@ -238,7 +237,6 @@ _ :: relative _ :: reflect _ :: runtime _ :: simd -_ :: simd_util _ :: x86 _ :: slice _ :: slice_heap diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin index 357d86f67..4fdf82a49 100644 --- a/tests/benchmark/all.odin +++ b/tests/benchmark/all.odin @@ -1,5 +1,5 @@ package benchmarks +@(require) import "bytes" @(require) import "crypto" @(require) import "hash" -@(require) import "simd/util" diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/bytes/benchmark_bytes.odin similarity index 75% rename from tests/benchmark/simd/util/benchmark_simd_util.odin rename to tests/benchmark/bytes/benchmark_bytes.odin index e2187ce45..d303e81dd 100644 --- a/tests/benchmark/simd/util/benchmark_simd_util.odin +++ b/tests/benchmark/bytes/benchmark_bytes.odin @@ -1,15 +1,15 @@ -package benchmark_simd_util +package benchmark_bytes +import "core:bytes" import "core:fmt" import "core:log" -import simd_util "core:simd/util" import "core:testing" import "core:time" // These are the normal, unoptimized algorithms. 
-plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := 0; i < len(s); i += 1 { if s[i] == c { return i @@ -18,7 +18,7 @@ plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_boun return -1 } -plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := len(s)-1; i >= 0; i -= 1 { if s[i] == c { return i @@ -37,7 +37,7 @@ sizes := [?]int { 1024 * 1024 * 1024, } -run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { +run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { data := make([]u8, size) defer delete(data) @@ -95,9 +95,9 @@ benchmark_plain_index_hot :: proc(t: ^testing.T) { benchmark_simd_index_cold :: proc(t: ^testing.T) { report: string for size in sizes { - timing := run_trial_size(simd_util.index_byte, size, size - 1, 0, 1) + timing := run_trial_size(bytes.index_byte, size, size - 1, 0, 1) report = fmt.tprintf("%s\n +++ % 8M | %v", report, size, timing) - timing = run_trial_size(simd_util.last_index_byte, size, 0, 0, 1) + timing = run_trial_size(bytes.last_index_byte, size, 0, 0, 1) report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing) } log.info(report) @@ -107,9 +107,9 @@ benchmark_simd_index_cold :: proc(t: ^testing.T) { benchmark_simd_index_hot :: proc(t: ^testing.T) { report: string for size in sizes { - timing := run_trial_size(simd_util.index_byte, size, size - 1, HOT, HOT) + timing := run_trial_size(bytes.index_byte, size, size - 1, HOT, HOT) report = fmt.tprintf("%s\n +++ % 8M | %v", report, size, timing) - timing = run_trial_size(simd_util.last_index_byte, size, 0, HOT, HOT) + timing = 
run_trial_size(bytes.last_index_byte, size, 0, HOT, HOT) report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing) } log.info(report) diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/bytes/test_core_bytes.odin similarity index 54% rename from tests/core/simd/util/test_core_simd_util.odin rename to tests/core/bytes/test_core_bytes.odin index ba302a121..9074c0205 100644 --- a/tests/core/simd/util/test_core_simd_util.odin +++ b/tests/core/bytes/test_core_bytes.odin @@ -1,6 +1,6 @@ -package test_core_simd_util +package test_core_bytes -import simd_util "core:simd/util" +import "core:bytes" import "core:testing" @test @@ -15,30 +15,30 @@ test_index_byte_sanity :: proc(t: ^testing.T) { // Find it at the end. data[n-1] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) { return } data[n-1] = '-' // Find it in the middle. data[n/2] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n/2) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n/2) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n/2) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n/2) { return } data[n/2] = '-' // Find it at the start. 
data[0] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), 0) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), 0) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), 0) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), 0) { return } } @@ -47,8 +47,8 @@ test_index_byte_sanity :: proc(t: ^testing.T) { @test test_index_byte_empty :: proc(t: ^testing.T) { a: [1]u8 - testing.expect_value(t, simd_util.index_byte(a[0:0], 'o'), -1) - testing.expect_value(t, simd_util.last_index_byte(a[0:0], 'o'), -1) + testing.expect_value(t, bytes.index_byte(a[0:0], 'o'), -1) + testing.expect_value(t, bytes.last_index_byte(a[0:0], 'o'), -1) } @test @@ -65,12 +65,12 @@ test_index_byte_multiple_hits :: proc(t: ^testing.T) { data[n-5] = 'o' // Find the first one. - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-5) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-5) { return } // Find the last one. - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) { return } } @@ -88,19 +88,19 @@ test_index_byte_zero :: proc(t: ^testing.T) { // Positive hit. data[n-1] = 0 - if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), n-1) { + if !testing.expect_value(t, bytes.index_byte(data[:n], 0), n-1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), n-1) { return } // Test for false positives. data[n-1] = '-' - if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), -1) { + if !testing.expect_value(t, bytes.index_byte(data[:n], 0), -1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), -1) { + if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), -1) { return } } @@ -117,22 +117,22 @@ test_misaligned_data :: proc(t: ^testing.T) { for m in 1.. 
Date: Sat, 10 Aug 2024 07:18:49 -0400 Subject: [PATCH 11/13] Set `SIMD_SCAN_WIDTH` based on `size_of(uintptr)` --- core/bytes/bytes.odin | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index e09859a19..8e7bc01bd 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -6,14 +6,30 @@ import "core:unicode" import "core:unicode/utf8" -@private SIMD_SCAN_WIDTH :: 32 +@private SIMD_SCAN_WIDTH :: 8 * size_of(uintptr) -@(private, rodata) -simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, +when SIMD_SCAN_WIDTH == 32 { + @(private, rodata) + simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + } +} else when SIMD_SCAN_WIDTH == 64 { + @(private, rodata) + simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + } +} else { + #panic("Invalid SIMD_SCAN_WIDTH. 
Must be 32 or 64.") } From 9d2b4b2f03a30e296e55f0f5f9ce33e20303f55b Mon Sep 17 00:00:00 2001 From: Feoramund <161657516+Feoramund@users.noreply.github.com> Date: Sat, 10 Aug 2024 08:13:22 -0400 Subject: [PATCH 12/13] Simplify `core:bytes` test --- tests/core/bytes/test_core_bytes.odin | 90 +++++---------------------- 1 file changed, 17 insertions(+), 73 deletions(-) diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin index 9074c0205..72390291f 100644 --- a/tests/core/bytes/test_core_bytes.odin +++ b/tests/core/bytes/test_core_bytes.odin @@ -1,45 +1,27 @@ package test_core_bytes import "core:bytes" +import "core:slice" import "core:testing" @test test_index_byte_sanity :: proc(t: ^testing.T) { // We must be able to find the byte at the correct index. - for n in 1..<256 { - data := make([]u8, n) - defer delete(data) - for i in 0.. Date: Sat, 10 Aug 2024 13:51:18 -0400 Subject: [PATCH 13/13] Use `SIMD_SCAN_WIDTH` constant in `core:bytes` test --- tests/core/bytes/test_core_bytes.odin | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/core/bytes/test_core_bytes.odin b/tests/core/bytes/test_core_bytes.odin index 72390291f..fb3c460aa 100644 --- a/tests/core/bytes/test_core_bytes.odin +++ b/tests/core/bytes/test_core_bytes.odin @@ -4,15 +4,19 @@ import "core:bytes" import "core:slice" import "core:testing" +@private SIMD_SCAN_WIDTH :: 8 * size_of(uintptr) + @test test_index_byte_sanity :: proc(t: ^testing.T) { // We must be able to find the byte at the correct index. - data := make([]u8, 64) + data := make([]u8, 2 * SIMD_SCAN_WIDTH) defer delete(data) slice.fill(data, '-') - for offset in 0..<31 { - for idx in 0..<31 { + INDEX_MAX :: SIMD_SCAN_WIDTH - 1 + + for offset in 0..