From dd5b7852ce569027e87d77f46601210aa4180947 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Mon, 5 May 2025 15:13:10 -0400
Subject: [PATCH] Added alternate reduce-add/reduce-mul intrinsics.

The new reduce_add/reduce_mul procs perform the corresponding arithmetic
reduction in different orders than sequential order. These alternative
orders can often offer better SIMD hardware utilization.

Two different orders are added: pair-wise (operating on pairs of
adjacent elements) or bisection-wise (operating element-wise on the
first and last N/2 elements of the vector).
---
 base/intrinsics/intrinsics.odin |   4 +
 core/simd/simd.odin             | 194 +++++++++++++++++++++++++++++++-
 src/check_builtin.cpp           |   4 +
 src/checker_builtin_procs.hpp   |   8 ++
 src/llvm_backend_proc.cpp       |  66 +++++++++++
 5 files changed, 274 insertions(+), 2 deletions(-)

diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin
index c275dedaf..9429ec023 100644
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
 
+simd_reduce_add_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_add_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 0e69304c3..a97155f58 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1759,7 +1759,103 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and adding the two halves element-wise
+to produce N/2 values. This is repeated until only a single element remains.
+This order may be faster to compute than the ordered sum for floats, as it can
+often be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [+]<-- | ---'      |
+	        |    [+]<--------'
+	        |     |
+	        `>[+]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
+
+/*
+Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+bisecting the vector into two parts, where the first contains indices [0, N/2)
+and the second contains indices [N/2, N), and multiplying the two halves
+together element-wise to produce N/2 values. This is repeated until only a
+single element remains. This order may be faster to compute than the ordered
+product for floats, as it can often be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [x]<-- | ---'      |
+	        |    [x]<--------'
+	        |     |
+	        `>[x]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
@@ -1782,7 +1878,7 @@ Result:
 reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
 
 /*
-Reduce a vector to a scalar by multiplying all the lanes.
+Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered product of all lanes.
 The ordered product may be important for accounting for precision errors in
@@ -1804,6 +1900,100 @@ Result:
 */
 reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
 
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order is supported by hardware instructions for some types/architectures (e.g.
+i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[+]<'     `>[+]<'
+	         |           |
+	         `--->[+]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and multiplying the two halves together
+multiplying each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order may be faster to compute than the ordered product for floats, as it can
+often be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[x]<'     `>[x]<'
+	         |           |
+	         `--->[x]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
+
 /*
 Reduce a vector to a scalar by finding the minimum value between all of the lanes.
 
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index a315d1880..84e30d5b4 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 		}
 		break;
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 		{
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index d8ac10b11..4c4387923 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -170,8 +170,12 @@ BuiltinProc__simd_begin,
 	BuiltinProc_simd_extract,
 	BuiltinProc_simd_replace,
 
+	BuiltinProc_simd_reduce_add_bisect,
+	BuiltinProc_simd_reduce_mul_bisect,
 	BuiltinProc_simd_reduce_add_ordered,
 	BuiltinProc_simd_reduce_mul_ordered,
+	BuiltinProc_simd_reduce_add_pairs,
+	BuiltinProc_simd_reduce_mul_pairs,
 	BuiltinProc_simd_reduce_min,
 	BuiltinProc_simd_reduce_max,
 	BuiltinProc_simd_reduce_and,
@@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
+	{STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_min"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_max"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_and"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 7bd8dea59..14157455e 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 		res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
 		return res;
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)i).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
 		{
@@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 			res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
 			return res;
 		}
+
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems/2; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value;
+				indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 	case BuiltinProc_simd_reduce_and: