From dd5b7852ce569027e87d77f46601210aa4180947 Mon Sep 17 00:00:00 2001 From: Barinzaya Date: Mon, 5 May 2025 15:13:10 -0400 Subject: [PATCH] Added alternate reduce-add/reduce-mul intrinsics. The new reduce_add/reduce_mul procs perform the corresponding arithmetic reduction in different orders than sequential order. These alternative orders can often offer better SIMD hardware utilization. Two different orders are added: pair-wise (operating on pairs of adjacent elements) or bisection-wise (operating element-wise on the first and last N/2 elements of the vector). --- base/intrinsics/intrinsics.odin | 4 + core/simd/simd.odin | 194 +++++++++++++++++++++++++++++++- src/check_builtin.cpp | 4 + src/checker_builtin_procs.hpp | 8 ++ src/llvm_backend_proc.cpp | 66 +++++++++++ 5 files changed, 274 insertions(+), 2 deletions(-) diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin index c275dedaf..9429ec023 100644 --- a/base/intrinsics/intrinsics.odin +++ b/base/intrinsics/intrinsics.odin @@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer --- simd_extract :: proc(a: #simd[N]T, idx: uint) -> T --- simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T --- +simd_reduce_add_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- +simd_reduce_mul_bisect :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- +simd_reduce_add_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- +simd_reduce_mul_pairs :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- simd_reduce_min :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- simd_reduce_max :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- simd_reduce_and :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)--- diff --git a/core/simd/simd.odin b/core/simd/simd.odin index 0e69304c3..a97155f58 100644 --- a/core/simd/simd.odin +++ b/core/simd/simd.odin @@ -1759,7 +1759,103 @@ Returns: replace :: intrinsics.simd_replace /* -Reduce a vector to a scalar by adding up all the lanes. +Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion. + +This procedure returns a scalar that is the sum of all lanes, calculated by +bisecting the vector into two parts, where the first contains lanes [0, N/2) +and the second contains lanes [N/2, N), and adding the two halves element-wise +to produce N/2 values. This is repeated until only a single element remains. +This order may be faster to compute than the ordered sum for floats, as it can +often be better parallelized. + +The order of the sum may be important for accounting for precision errors in +floating-point computation, as floating-point addition is not associative, that +is `(a+b)+c` may not be equal to `a+(b+c)`. + +Inputs: +- `v`: The vector to reduce. + +Result: +- Sum of all lanes, as a scalar. + +**Operation**: + + for n > 1 { + n = n / 2 + for i in 0 ..< n { + a[i] += a[i+n] + } + } + res := a[0] + +Graphical representation of the operation for N=4: + + +-----------------------+ + | v0 | v1 | v2 | v3 | + +-----------------------+ + | | | | + [+]<-- | ---' | + | [+]<--------' + | | + `>[+]<' + | + v + +-----+ + result: | y0 | + +-----+ +*/ +reduce_add_bisect :: intrinsics.simd_reduce_add_bisect + +/* +Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion. + +This procedure returns a scalar that is the product of all lanes, calculated by +bisecting the vector into two parts, where the first contains indices [0, N/2) +and the second contains indices [N/2, N), and multiplying the two halves +together element-wise to produce N/2 values. This is repeated until only a +single element remains. This order may be faster to compute than the ordered +product for floats, as it can often be better parallelized. + +The order of the product may be important for accounting for precision errors +in floating-point computation, as floating-point multiplication is not +associative, that is `(a*b)*c` may not be equal to `a*(b*c)`. + +Inputs: +- `v`: The vector to reduce. + +Result: +- Product of all lanes, as a scalar. + +**Operation**: + + for n > 1 { + n = n / 2 + for i in 0 ..< n { + a[i] *= a[i+n] + } + } + res := a[0] + +Graphical representation of the operation for N=4: + + +-----------------------+ + | v0 | v1 | v2 | v3 | + +-----------------------+ + | | | | + [x]<-- | ---' | + | [x]<--------' + | | + `>[x]<' + | + v + +-----+ + result: | y0 | + +-----+ +*/ +reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect + +/* +Reduce a vector to a scalar by adding up all the lanes in an ordered fashion. This procedure returns a scalar that is the ordered sum of all lanes. The ordered sum may be important for accounting for precision errors in @@ -1782,7 +1878,7 @@ Result: reduce_add_ordered :: intrinsics.simd_reduce_add_ordered /* -Reduce a vector to a scalar by multiplying all the lanes. +Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion. This procedure returns a scalar that is the ordered product of all lanes. The ordered product may be important for accounting for precision errors in @@ -1804,6 +1900,100 @@ Result: */ reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered +/* +Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion. + +This procedure returns a scalar that is the sum of all lanes, calculated by +adding each even-indexed element with the following odd-indexed element to +produce N/2 values. This is repeated until only a single element remains. This +order is supported by hardware instructions for some types/architectures (e.g. +i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON). + +The order of the sum may be important for accounting for precision errors in +floating-point computation, as floating-point addition is not associative, that +is `(a+b)+c` may not be equal to `a+(b+c)`. + +Inputs: +- `v`: The vector to reduce. + +Result: +- Sum of all lanes, as a scalar. + +**Operation**: + + for n > 1 { + n = n / 2 + for i in 0 ..< n { + a[i] = a[2*i+0] + a[2*i+1] + } + } + res := a[0] + +Graphical representation of the operation for N=4: + + +-----------------------+ + v: | v0 | v1 | v2 | v3 | + +-----------------------+ + | | | | + `>[+]<' `>[+]<' + | | + `--->[+]<--' + | + v + +-----+ + result: | y0 | + +-----+ +*/ +reduce_add_pairs :: intrinsics.simd_reduce_add_pairs + +/* +Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion. + +This procedure returns a scalar that is the product of all lanes, calculated by +bisecting the vector into two parts, where the first contains lanes [0, N/2) +and the second contains lanes [N/2, N), and multiplying the two halves together +multiplying each even-indexed element with the following odd-indexed element to +produce N/2 values. This is repeated until only a single element remains. This +order may be faster to compute than the ordered product for floats, as it can +often be better parallelized. + +The order of the product may be important for accounting for precision errors +in floating-point computation, as floating-point multiplication is not +associative, that is `(a*b)*c` may not be equal to `a*(b*c)`. + +Inputs: +- `v`: The vector to reduce. + +Result: +- Product of all lanes, as a scalar. + +**Operation**: + + for n > 1 { + n = n / 2 + for i in 0 ..< n { + a[i] = a[2*i+0] * a[2*i+1] + } + } + res := a[0] + +Graphical representation of the operation for N=4: + + +-----------------------+ + v: | v0 | v1 | v2 | v3 | + +-----------------------+ + | | | | + `>[x]<' `>[x]<' + | | + `--->[x]<--' + | + v + +-----+ + result: | y0 | + +-----+ +*/ +reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs + /* Reduce a vector to a scalar by finding the minimum value between all of the lanes. diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp index a315d1880..84e30d5b4 100644 --- a/src/check_builtin.cpp +++ b/src/check_builtin.cpp @@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan } break; + case BuiltinProc_simd_reduce_add_bisect: + case BuiltinProc_simd_reduce_mul_bisect: case BuiltinProc_simd_reduce_add_ordered: case BuiltinProc_simd_reduce_mul_ordered: + case BuiltinProc_simd_reduce_add_pairs: + case BuiltinProc_simd_reduce_mul_pairs: case BuiltinProc_simd_reduce_min: case BuiltinProc_simd_reduce_max: { diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp index d8ac10b11..4c4387923 100644 --- a/src/checker_builtin_procs.hpp +++ b/src/checker_builtin_procs.hpp @@ -170,8 +170,12 @@ BuiltinProc__simd_begin, BuiltinProc_simd_extract, BuiltinProc_simd_replace, + BuiltinProc_simd_reduce_add_bisect, + BuiltinProc_simd_reduce_mul_bisect, BuiltinProc_simd_reduce_add_ordered, BuiltinProc_simd_reduce_mul_ordered, + BuiltinProc_simd_reduce_add_pairs, + BuiltinProc_simd_reduce_mul_pairs, BuiltinProc_simd_reduce_min, BuiltinProc_simd_reduce_max, BuiltinProc_simd_reduce_and, @@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_reduce_min"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_reduce_max"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("simd_reduce_and"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp index 7bd8dea59..14157455e 100644 --- a/src/llvm_backend_proc.cpp +++ b/src/llvm_backend_proc.cpp @@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, ""); return res; + case BuiltinProc_simd_reduce_add_bisect: + case BuiltinProc_simd_reduce_mul_bisect: + { + GB_ASSERT(arg0.type->kind == Type_SimdVector); + i64 num_elems = arg0.type->SimdVector.count; + + LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems); + for (i64 i = 0; i < num_elems; i++) { + indices[i] = lb_const_int(m, t_uint, cast(u64)i).value; + } + + switch (builtin_id) { + case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break; + case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break; + } + + LLVMValueRef remaining = arg0.value; + i64 num_remaining = num_elems; + + while (num_remaining > 1) { + num_remaining /= 2; + LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining); + LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, ""); + LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining); + LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, ""); + remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, ""); + } + + res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], ""); + return res; + } + case BuiltinProc_simd_reduce_add_ordered: case BuiltinProc_simd_reduce_mul_ordered: { @@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types)); return res; } + + case BuiltinProc_simd_reduce_add_pairs: + case BuiltinProc_simd_reduce_mul_pairs: + { + GB_ASSERT(arg0.type->kind == Type_SimdVector); + i64 num_elems = arg0.type->SimdVector.count; + + LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems); + for (i64 i = 0; i < num_elems/2; i++) { + indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value; + indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value; + } + + switch (builtin_id) { + case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break; + case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break; + } + + LLVMValueRef remaining = arg0.value; + i64 num_remaining = num_elems; + + while (num_remaining > 1) { + num_remaining /= 2; + LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining); + LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, ""); + LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining); + LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, ""); + remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, ""); + } + + res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], ""); + return res; + } + case BuiltinProc_simd_reduce_min: case BuiltinProc_simd_reduce_max: case BuiltinProc_simd_reduce_and: