diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin index e130502a1..e09859a19 100644 --- a/core/bytes/bytes.odin +++ b/core/bytes/bytes.odin @@ -2,10 +2,21 @@ package bytes import "base:intrinsics" import "core:mem" -@require import simd_util "core:simd/util" import "core:unicode" import "core:unicode/utf8" + +@private SIMD_SCAN_WIDTH :: 32 + +@(private, rodata) +simd_scanner_indices := #simd[SIMD_SCAN_WIDTH]u8 { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, +} + + clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte { c := make([]byte, len(s), allocator, loc) copy(c, s) @@ -295,43 +306,141 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) { return _split_iterator(s, sep, len(sep)) } +/* +Scan a slice of bytes for a specific byte. -index_byte :: proc(s: []byte, c: byte) -> int { - _index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - for ch, i in s { - if ch == c { +This procedure safely handles slices of any length, including empty slices. + +Inputs: +- s: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. +*/ +index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check { + length := len(s) + i := 0 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i < length; i += 1 { + if s[i] == c { return i } } return -1 } - // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a - // significant speedup when compiling in either Size or Speed mode. - // The SIMD version is usually 2-3x slower without optimizations on. 
- when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.index_byte(s, c) - } else { - return _index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i < alignment_start; i += 1 { + if s[i] == c { + return i + } } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. + scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + tail := length - (length - alignment_start) % SIMD_SCAN_WIDTH + + for /**/; i < tail; i += SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 = u8(0xFF) + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_min(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + for /**/; i < length; i += 1 { + if s[i] == c { + return i + } + } + + return -1 } -// Returns -1 if c is not present -last_index_byte :: proc(s: []byte, c: byte) -> int { - _last_index_byte :: #force_inline proc "contextless" (s: []byte, c: byte) -> int { - #reverse for ch, i in s { - if ch == c { +/* +Scan a slice of bytes for a specific byte, starting from the end and working +backwards to the start. + +This procedure safely handles slices of any length, including empty slices. + +Inputs: +- s: A slice of bytes. +- c: The byte to search for. + +Returns: +- index: The index of the byte `c`, or -1 if it was not found. 
+*/ +last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check { + length := len(s) + i := length - 1 + + // Guard against small strings. + if length < SIMD_SCAN_WIDTH { + for /**/; i >= 0; i -= 1 { + if s[i] == c { return i } } return -1 } - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.last_index_byte(s, c) - } else { - return _last_index_byte(s, c) + ptr := cast(int)cast(uintptr)raw_data(s) + + tail := length - (ptr + length) % SIMD_SCAN_WIDTH + + // Iterate as a scalar until the data is aligned on a `SIMD_SCAN_WIDTH` boundary. + // + // This way, every load in the vector loop will be aligned, which should be + // the fastest possible scenario. + for /**/; i >= tail; i -= 1 { + if s[i] == c { + return i + } } + + // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. + scanner: #simd[SIMD_SCAN_WIDTH]u8 = c + alignment_start := (SIMD_SCAN_WIDTH - ptr % SIMD_SCAN_WIDTH) % SIMD_SCAN_WIDTH + + i -= SIMD_SCAN_WIDTH - 1 + + for /**/; i >= alignment_start; i -= SIMD_SCAN_WIDTH { + load := (cast(^#simd[SIMD_SCAN_WIDTH]u8)(&s[i]))^ + comparison := intrinsics.simd_lanes_eq(load, scanner) + match := intrinsics.simd_reduce_or(comparison) + if match > 0 { + sentinel: #simd[SIMD_SCAN_WIDTH]u8 + index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) + index_reduce := intrinsics.simd_reduce_max(index_select) + return i + cast(int)index_reduce + } + } + + // Iterate as a scalar over the remaining unaligned portion. + i += SIMD_SCAN_WIDTH - 1 + + for /**/; i >= 0; i -= 1 { + if s[i] == c { + return i + } + } + + return -1 } diff --git a/core/simd/util/util.odin b/core/simd/util/util.odin deleted file mode 100644 index 74401689a..000000000 --- a/core/simd/util/util.odin +++ /dev/null @@ -1,160 +0,0 @@ -/* - (c) Copyright 2024 Feoramund . - Made available under Odin's BSD-3 license. - - List of contributors: - Feoramund: `index_byte` procedures. 
-*/ - -// package simd_util implements compositions of SIMD operations for optimizing -// the core library where available. -package simd_util - -import "base:intrinsics" - -@private SCAN_WIDTH :: 32 - -@(private, rodata) -simd_scanner_indices := #simd[SCAN_WIDTH]u8 { - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, -} - -/* -Scan a slice of bytes for a specific byte. - -This procedure safely handles slices of any length, including empty slices. - -Inputs: -- data: A slice of bytes. -- c: The byte to search for. - -Returns: -- index: The index of the byte `c`, or -1 if it was not found. -*/ -index_byte :: proc "contextless" (data: []u8, c: byte) -> (index: int) #no_bounds_check { - length := len(data) - i := 0 - - // Guard against small strings. - if length < SCAN_WIDTH { - for /**/; i < length; i += 1 { - if data[i] == c { - return i - } - } - return -1 - } - - ptr := cast(int)cast(uintptr)raw_data(data) - - alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH - - // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary. - // - // This way, every load in the vector loop will be aligned, which should be - // the fastest possible scenario. - for /**/; i < alignment_start; i += 1 { - if data[i] == c { - return i - } - } - - // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. 
- scanner: #simd[SCAN_WIDTH]u8 = c - tail := length - (length - alignment_start) % SCAN_WIDTH - - for /**/; i < tail; i += SCAN_WIDTH { - load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ - comparison := intrinsics.simd_lanes_eq(load, scanner) - match := intrinsics.simd_reduce_or(comparison) - if match > 0 { - sentinel: #simd[SCAN_WIDTH]u8 = u8(0xFF) - index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) - index_reduce := intrinsics.simd_reduce_min(index_select) - return i + cast(int)index_reduce - } - } - - // Iterate as a scalar over the remaining unaligned portion. - for /**/; i < length; i += 1 { - if data[i] == c { - return i - } - } - - return -1 -} - -/* -Scan a slice of bytes for a specific byte, starting from the end and working -backwards to the start. - -This procedure safely handles slices of any length, including empty slices. - -Inputs: -- data: A slice of bytes. -- c: The byte to search for. - -Returns: -- index: The index of the byte `c`, or -1 if it was not found. -*/ -last_index_byte :: proc "contextless" (data: []u8, c: byte) -> int #no_bounds_check { - length := len(data) - i := length - 1 - - // Guard against small strings. - if length < SCAN_WIDTH { - for /**/; i >= 0; i -= 1 { - if data[i] == c { - return i - } - } - return -1 - } - - ptr := cast(int)cast(uintptr)raw_data(data) - - tail := length - (ptr + length) % SCAN_WIDTH - - // Iterate as a scalar until the data is aligned on a `SCAN_WIDTH` boundary. - // - // This way, every load in the vector loop will be aligned, which should be - // the fastest possible scenario. - for /**/; i >= tail; i -= 1 { - if data[i] == c { - return i - } - } - - // Iterate as a vector over every aligned chunk, evaluating each byte simultaneously at the CPU level. 
- scanner: #simd[SCAN_WIDTH]u8 = c - alignment_start := (SCAN_WIDTH - ptr % SCAN_WIDTH) % SCAN_WIDTH - - i -= SCAN_WIDTH - 1 - - for /**/; i >= alignment_start; i -= SCAN_WIDTH { - load := (cast(^#simd[SCAN_WIDTH]u8)(&data[i]))^ - comparison := intrinsics.simd_lanes_eq(load, scanner) - match := intrinsics.simd_reduce_or(comparison) - if match > 0 { - sentinel: #simd[SCAN_WIDTH]u8 - index_select := intrinsics.simd_select(comparison, simd_scanner_indices, sentinel) - index_reduce := intrinsics.simd_reduce_max(index_select) - return i + cast(int)index_reduce - } - } - - // Iterate as a scalar over the remaining unaligned portion. - i += SCAN_WIDTH - 1 - - for /**/; i >= 0; i -= 1 { - if data[i] == c { - return i - } - } - - return -1 -} diff --git a/core/strings/strings.odin b/core/strings/strings.odin index ed7f494ae..be4275e8b 100644 --- a/core/strings/strings.odin +++ b/core/strings/strings.odin @@ -2,8 +2,8 @@ package strings import "base:intrinsics" +import "core:bytes" import "core:io" -@require import simd_util "core:simd/util" import "core:mem" import "core:unicode" import "core:unicode/utf8" @@ -1426,23 +1426,7 @@ Output: */ index_byte :: proc(s: string, c: byte) -> (res: int) { - _index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { - for i := 0; i < len(s); i += 1 { - if s[i] == c { - return i - } - } - return -1 - } - - // NOTE(Feoramund): On my Alder Lake CPU, I have only witnessed a - // significant speedup when compiling in either Size or Speed mode. - // The SIMD version is usually 2-3x slower without optimizations on. - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.index_byte(transmute([]u8)s, c) - } else { - return _index_byte(s, c) - } + return #force_inline bytes.index_byte(transmute([]u8)s, c) } /* Returns the byte offset of the last byte `c` in the string `s`, -1 when not found. 
@@ -1477,20 +1461,7 @@ Output: */ last_index_byte :: proc(s: string, c: byte) -> (res: int) { - _last_index_byte :: #force_inline proc "contextless" (s: string, c: byte) -> int { - for i := len(s)-1; i >= 0; i -= 1 { - if s[i] == c { - return i - } - } - return -1 - } - - when ODIN_OPTIMIZATION_MODE > .Minimal { - return #force_inline simd_util.last_index_byte(transmute([]u8)s, c) - } else { - return _last_index_byte(s, c) - } + return #force_inline bytes.last_index_byte(transmute([]u8)s, c) } /* Returns the byte offset of the first rune `r` in the string `s` it finds, -1 when not found. diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index 43ea0de98..d92a6b8c4 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -115,7 +115,6 @@ import relative "core:relative" import reflect "core:reflect" import runtime "base:runtime" import simd "core:simd" -import simd_util "core:simd/util" import x86 "core:simd/x86" import slice "core:slice" import slice_heap "core:slice/heap" @@ -238,7 +237,6 @@ _ :: relative _ :: reflect _ :: runtime _ :: simd -_ :: simd_util _ :: x86 _ :: slice _ :: slice_heap diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin index 357d86f67..4fdf82a49 100644 --- a/tests/benchmark/all.odin +++ b/tests/benchmark/all.odin @@ -1,5 +1,5 @@ package benchmarks +@(require) import "bytes" @(require) import "crypto" @(require) import "hash" -@(require) import "simd/util" diff --git a/tests/benchmark/simd/util/benchmark_simd_util.odin b/tests/benchmark/bytes/benchmark_bytes.odin similarity index 75% rename from tests/benchmark/simd/util/benchmark_simd_util.odin rename to tests/benchmark/bytes/benchmark_bytes.odin index e2187ce45..d303e81dd 100644 --- a/tests/benchmark/simd/util/benchmark_simd_util.odin +++ b/tests/benchmark/bytes/benchmark_bytes.odin @@ -1,15 +1,15 @@ -package benchmark_simd_util +package benchmark_bytes +import "core:bytes" import "core:fmt" import "core:log" -import simd_util 
"core:simd/util" import "core:testing" import "core:time" // These are the normal, unoptimized algorithms. -plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := 0; i < len(s); i += 1 { if s[i] == c { return i @@ -18,7 +18,7 @@ plain_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_boun return -1 } -plain_last_index_byte :: proc "contextless" (s: []u8, c: byte) -> (res: int) #no_bounds_check { +plain_last_index_byte :: proc(s: []u8, c: byte) -> (res: int) #no_bounds_check { for i := len(s)-1; i >= 0; i -= 1 { if s[i] == c { return i @@ -37,7 +37,7 @@ sizes := [?]int { 1024 * 1024 * 1024, } -run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { +run_trial_size :: proc(p: proc([]u8, byte) -> int, size: int, idx: int, warmup: int, runs: int) -> (timing: time.Duration) { data := make([]u8, size) defer delete(data) @@ -95,9 +95,9 @@ benchmark_plain_index_hot :: proc(t: ^testing.T) { benchmark_simd_index_cold :: proc(t: ^testing.T) { report: string for size in sizes { - timing := run_trial_size(simd_util.index_byte, size, size - 1, 0, 1) + timing := run_trial_size(bytes.index_byte, size, size - 1, 0, 1) report = fmt.tprintf("%s\n +++ % 8M | %v", report, size, timing) - timing = run_trial_size(simd_util.last_index_byte, size, 0, 0, 1) + timing = run_trial_size(bytes.last_index_byte, size, 0, 0, 1) report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing) } log.info(report) @@ -107,9 +107,9 @@ benchmark_simd_index_cold :: proc(t: ^testing.T) { benchmark_simd_index_hot :: proc(t: ^testing.T) { report: string for size in sizes { - timing := run_trial_size(simd_util.index_byte, size, size - 1, HOT, HOT) + timing := run_trial_size(bytes.index_byte, size, size - 1, HOT, HOT) report = fmt.tprintf("%s\n +++ % 8M | %v", report, size, timing) 
- timing = run_trial_size(simd_util.last_index_byte, size, 0, HOT, HOT) + timing = run_trial_size(bytes.last_index_byte, size, 0, HOT, HOT) report = fmt.tprintf("%s\n (last) +++ % 8M | %v", report, size, timing) } log.info(report) diff --git a/tests/core/simd/util/test_core_simd_util.odin b/tests/core/bytes/test_core_bytes.odin similarity index 54% rename from tests/core/simd/util/test_core_simd_util.odin rename to tests/core/bytes/test_core_bytes.odin index ba302a121..9074c0205 100644 --- a/tests/core/simd/util/test_core_simd_util.odin +++ b/tests/core/bytes/test_core_bytes.odin @@ -1,6 +1,6 @@ -package test_core_simd_util +package test_core_bytes -import simd_util "core:simd/util" +import "core:bytes" import "core:testing" @test @@ -15,30 +15,30 @@ test_index_byte_sanity :: proc(t: ^testing.T) { // Find it at the end. data[n-1] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) { return } data[n-1] = '-' // Find it in the middle. data[n/2] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n/2) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n/2) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n/2) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n/2) { return } data[n/2] = '-' // Find it at the start. 
data[0] = 'o' - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), 0) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), 0) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), 0) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), 0) { return } } @@ -47,8 +47,8 @@ test_index_byte_sanity :: proc(t: ^testing.T) { @test test_index_byte_empty :: proc(t: ^testing.T) { a: [1]u8 - testing.expect_value(t, simd_util.index_byte(a[0:0], 'o'), -1) - testing.expect_value(t, simd_util.last_index_byte(a[0:0], 'o'), -1) + testing.expect_value(t, bytes.index_byte(a[0:0], 'o'), -1) + testing.expect_value(t, bytes.last_index_byte(a[0:0], 'o'), -1) } @test @@ -65,12 +65,12 @@ test_index_byte_multiple_hits :: proc(t: ^testing.T) { data[n-5] = 'o' // Find the first one. - if !testing.expect_value(t, simd_util.index_byte(data, 'o'), n-5) { + if !testing.expect_value(t, bytes.index_byte(data, 'o'), n-5) { return } // Find the last one. - if !testing.expect_value(t, simd_util.last_index_byte(data, 'o'), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data, 'o'), n-1) { return } } @@ -88,19 +88,19 @@ test_index_byte_zero :: proc(t: ^testing.T) { // Positive hit. data[n-1] = 0 - if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), n-1) { + if !testing.expect_value(t, bytes.index_byte(data[:n], 0), n-1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), n-1) { + if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), n-1) { return } // Test for false positives. data[n-1] = '-' - if !testing.expect_value(t, simd_util.index_byte(data[:n], 0), -1) { + if !testing.expect_value(t, bytes.index_byte(data[:n], 0), -1) { return } - if !testing.expect_value(t, simd_util.last_index_byte(data[:n], 0), -1) { + if !testing.expect_value(t, bytes.last_index_byte(data[:n], 0), -1) { return } } @@ -117,22 +117,22 @@ test_misaligned_data :: proc(t: ^testing.T) { for m in 1..