From 7e34d707bb710eae6b764fc1e792c17eb879f1f8 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Sat, 3 May 2025 11:42:24 -0400
Subject: [PATCH 1/5] `core:simd` helpers: indices and reduce_add/mul

The indices proc simply creates a vector where each lane contains its
own lane index. This can be useful for use in generating masks for loads
and stores at the beginning/end of slices, among other things.

The new reduce_add/reduce_mul procs perform the corresponding arithmetic
reduction, in different orders than just "in sequential order". These
alternative orders can often be faster to calculate, as they can offer
better SIMD hardware utilization.

Two different orders are added for these: pair-wise (operating on
adjacent pairs of elements) or split-wise (operating element-wise on the
two halves of the vector).

This doesn't actually cover the *fastest* way for arbitrarily-sized
vectors. That would be an ordered reduction across the native vector
width, then reducing the resulting vector to a scalar in an appropriate
parallel fashion. I'd created an implementation of that, but it required
multiple procs and a fair bit more trickery than I was comfortable with
submitting to `core`, so it's not included yet. Maybe in the future.
---
 core/simd/simd.odin | 456 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 455 insertions(+), 1 deletion(-)

diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 37cc19ebd..385e24785 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1759,7 +1759,7 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
@@ -2510,3 +2510,457 @@ Example:
 recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
 	return T(1) / v
 }
+
+/*
+Creates a vector where each lane contains the index of that lane.
+
+Inputs:
+- `V`: The type of the vector to create.
+
+Result:
+- A vector of the given type, where each lane contains the index of that lane.
+
+**Operation**:
+
+>	for i in 0 ..< N {
+		res[i] = i
+	}
+*/
+indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where intrinsics.type_is_numeric(E) {
+	when N == 1 {
+		return {0}
+	} else when N == 2 {
+		return {0, 1}
+	} else when N == 4 {
+		return {0, 1, 2, 3}
+	} else when N == 8 {
+		return {0, 1, 2, 3, 4, 5, 6, 7}
+	} else when N == 16 {
+		return {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+	} else when N == 32 {
+		return {
+			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+		}
+	} else when N == 64 {
+		return {
+			0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+			32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+		}
+	} else {
+		#panic("Unsupported vector size!")
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-numbered element with the following odd-numbered element. This
+is repeated until only a single element remains. This order is supported by
+hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
+SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[+]<'     `>[+]<'
+	         |           |
+	         `--->[+]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+splitting the vector in two parts and adding the two halves together
+element-wise. This is repeated until only a single element remains. This order
+will typically be faster to compute than the ordered sum for floats, as it can
+be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [+]<-- | ---'      |
+	        |    [+]<--------'
+	        |     |
+	        `>[+]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 + y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 + y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 + y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 + y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 + y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) + extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+multiplying each even-numbered element with the following odd-numbered element.
+This is repeated until only a single element remains. This order may be faster
+to compute than the ordered product for floats, as it can be better
+parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[x]<'     `>[x]<'
+	         |           |
+	         `--->[x]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30,
+			32, 34, 36, 38, 40, 42, 44, 46,
+			48, 50, 52, 54, 56, 58, 60, 62)
+		y32 := swizzle(v64,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31,
+			33, 35, 37, 39, 41, 43, 45, 47,
+			49, 51, 53, 55, 57, 59, 61, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  2,  4,  6,  8,  10, 12, 14,
+			16, 18, 20, 22, 24, 26, 28, 30)
+		y16 := swizzle(v32,
+			1,  3,  5,  7,  9,  11, 13, 15,
+			17, 19, 21, 23, 25, 27, 29, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
+		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 2, 4, 6)
+		y4 := swizzle(v8, 1, 3, 5, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 2)
+		y2 := swizzle(v4, 1, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+
+/*
+Reduce a vector to a scalar by multiplying up all the lanes in a binary fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+splitting the vector in two parts and multiplying the two halves together
+element-wise until only a single element remains. This is repeated until only a
+single element remains. This order may be faster to compute than the ordered
+product for floats, as it can be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [x]<-- | ---'      |
+	        |    [x]<--------'
+	        |     |
+	        `>[x]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+	where intrinsics.type_is_numeric(E) {
+	when N == 64 { v64 := v }
+	when N == 32 { v32 := v }
+	when N == 16 { v16 := v }
+	when N == 8  { v8 := v }
+	when N == 4  { v4 := v }
+	when N == 2  { v2 := v }
+
+	when N >= 64 {
+		x32 := swizzle(v64,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		y32 := swizzle(v64,
+			32, 33, 34, 35, 36, 37, 38, 39,
+			40, 41, 42, 43, 44, 45, 46, 47,
+			48, 49, 50, 51, 52, 53, 54, 55,
+			56, 57, 58, 59, 60, 61, 62, 63)
+		v32 := x32 * y32
+	}
+
+	when N >= 32 {
+		x16 := swizzle(v32,
+			0,  1,  2,  3,  4,  5,  6,  7,
+			8,  9,  10, 11, 12, 13, 14, 15)
+		y16 := swizzle(v32,
+			16, 17, 18, 19, 20, 21, 22, 23,
+			24, 25, 26, 27, 28, 29, 30, 31)
+		v16 := x16 * y16
+	}
+
+	when N >= 16 {
+		x8 := swizzle(v16, 0, 1, 2,  3,  4,  5,  6,  7)
+		y8 := swizzle(v16, 8, 9, 10, 11, 12, 13, 14, 15)
+		v8 := x8 * y8
+	}
+
+	when N >= 8 {
+		x4 := swizzle(v8, 0, 1, 2, 3)
+		y4 := swizzle(v8, 4, 5, 6, 7)
+		v4 := x4 * y4
+	}
+
+	when N >= 4 {
+		x2 := swizzle(v4, 0, 1)
+		y2 := swizzle(v4, 2, 3)
+		v2 := x2 * y2
+	}
+
+	when N >= 2 {
+		return extract(v2, 0) * extract(v2, 1)
+	} else {
+		return extract(v, 0)
+	}
+}
+

From 8b6436201e1e946f81b8e6016bd701a39aa564b7 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Sat, 3 May 2025 13:04:11 -0400
Subject: [PATCH 2/5] Fixed a reduce_add proc doing multiplication instead.

---
 core/simd/simd.odin | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 385e24785..3ea22cbcb 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -2618,7 +2618,7 @@ reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 			17, 19, 21, 23, 25, 27, 29, 31,
 			33, 35, 37, 39, 41, 43, 45, 47,
 			49, 51, 53, 55, 57, 59, 61, 63)
-		v32 := x32 * y32
+		v32 := x32 + y32
 	}
 
 	when N >= 32 {
@@ -2628,29 +2628,29 @@ reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 		y16 := swizzle(v32,
 			1,  3,  5,  7,  9,  11, 13, 15,
 			17, 19, 21, 23, 25, 27, 29, 31)
-		v16 := x16 * y16
+		v16 := x16 + y16
 	}
 
 	when N >= 16 {
 		x8 := swizzle(v16, 0, 2, 4, 6, 8, 10, 12, 14)
 		y8 := swizzle(v16, 1, 3, 5, 7, 9, 11, 13, 15)
-		v8 := x8 * y8
+		v8 := x8 + y8
 	}
 
 	when N >= 8 {
 		x4 := swizzle(v8, 0, 2, 4, 6)
 		y4 := swizzle(v8, 1, 3, 5, 7)
-		v4 := x4 * y4
+		v4 := x4 + y4
 	}
 
 	when N >= 4 {
 		x2 := swizzle(v4, 0, 2)
 		y2 := swizzle(v4, 1, 3)
-		v2 := x2 * y2
+		v2 := x2 + y2
 	}
 
 	when N >= 2 {
-		return extract(v2, 0) * extract(v2, 1)
+		return extract(v2, 0) + extract(v2, 1)
 	} else {
 		return extract(v, 0)
 	}

From 6ebd30033f4d97bb2a943f3b50db894620f48db8 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Sat, 3 May 2025 13:24:22 -0400
Subject: [PATCH 3/5] Removed an extra character that slipped into a comment.

---
 core/simd/simd.odin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 3ea22cbcb..e11bdf816 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -2522,7 +2522,7 @@ Result:
 
 **Operation**:
 
->	for i in 0 ..< N {
+	for i in 0 ..< N {
 		res[i] = i
 	}
 */

From b0f53a6eaf8fcfeaffce84c9077c6955df222788 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Sat, 3 May 2025 17:25:20 -0400
Subject: [PATCH 4/5] Implemented suggestions on `core:simd` helpers.

Adjusted documentation, and renamed the reduce_*_split procs to
reduce_*_bisect.
---
 core/simd/simd.odin | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index e11bdf816..a2fe22b4b 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -2512,7 +2512,7 @@ recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where int
 }
 
 /*
-Creates a vector where each lane contains the index of that lane.
+Create a vector where each lane contains the index of that lane.
 
 Inputs:
 - `V`: The type of the vector to create.
@@ -2558,10 +2558,10 @@ indices :: #force_inline proc "contextless" ($V: typeid/#simd[$N]$E) -> V where
 Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
 
 This procedure returns a scalar that is the sum of all lanes, calculated by
-adding each even-numbered element with the following odd-numbered element. This
-is repeated until only a single element remains. This order is supported by
-hardware instructions for some types/architectures (e.g. i16/i32/f32/f64 on x86
-SSE, i8/i16/i32/f32 on ARM NEON).
+adding each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order is supported by hardware instructions for some types/architectures (e.g.
+i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
 
 The order of the sum may be important for accounting for precision errors in
 floating-point computation, as floating-point addition is not associative, that
@@ -2657,13 +2657,14 @@ reduce_add_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 }
 
 /*
-Reduce a vector to a scalar by adding up all the lanes in a binary fashion.
+Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
 
 This procedure returns a scalar that is the sum of all lanes, calculated by
-splitting the vector in two parts and adding the two halves together
-element-wise. This is repeated until only a single element remains. This order
-will typically be faster to compute than the ordered sum for floats, as it can
-be better parallelized.
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and adding the two halves element-wise
+to produce N/2 values. This is repeated until only a single element remains.
+This order may be faster to compute than the ordered sum for floats, as it can
+often be better parallelized.
 
 The order of the sum may be important for accounting for precision errors in
 floating-point computation, as floating-point addition is not associative, that
@@ -2701,7 +2702,7 @@ Graphical representation of the operation for N=4:
 	result: | y0  |
 	        +-----+
 */
-reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+reduce_add_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 	where intrinsics.type_is_numeric(E) {
 	when N == 64 { v64 := v }
 	when N == 32 { v32 := v }
@@ -2763,10 +2764,12 @@ reduce_add_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
 
 This procedure returns a scalar that is the product of all lanes, calculated by
-multiplying each even-numbered element with the following odd-numbered element.
-This is repeated until only a single element remains. This order may be faster
-to compute than the ordered product for floats, as it can be better
-parallelized.
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and multiplying the two halves together
+multiplying each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order may be faster to compute than the ordered product for floats, as it can
+often be better parallelized.
 
 The order of the product may be important for accounting for precision errors
 in floating-point computation, as floating-point multiplication is not
@@ -2862,13 +2865,14 @@ reduce_mul_pairs :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 }
 
 /*
-Reduce a vector to a scalar by multiplying up all the lanes in a binary fashion.
+Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion.
 
 This procedure returns a scalar that is the product of all lanes, calculated by
-splitting the vector in two parts and multiplying the two halves together
-element-wise until only a single element remains. This is repeated until only a
+bisecting the vector into two parts, where the first contains indices [0, N/2)
+and the second contains indices [N/2, N), and multiplying the two halves
+together element-wise to produce N/2 values. This is repeated until only a
 single element remains. This order may be faster to compute than the ordered
-product for floats, as it can be better parallelized.
+product for floats, as it can often be better parallelized.
 
 The order of the product may be important for accounting for precision errors
 in floating-point computation, as floating-point multiplication is not
@@ -2906,7 +2910,7 @@ Graphical representation of the operation for N=4:
 	result: | y0  |
 	        +-----+
 */
-reduce_mul_split :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
+reduce_mul_bisect :: #force_inline proc "contextless" (v: #simd[$N]$E) -> E
 	where intrinsics.type_is_numeric(E) {
 	when N == 64 { v64 := v }
 	when N == 32 { v32 := v }

From dd5b7852ce569027e87d77f46601210aa4180947 Mon Sep 17 00:00:00 2001
From: Barinzaya <barinzaya@gmail.com>
Date: Mon, 5 May 2025 15:13:10 -0400
Subject: [PATCH 5/5] Added alternate reduce-add/reduce-mul intrinsics.

The new reduce_add/reduce_mul procs perform the corresponding arithmetic
reduction in different orders than sequential order. These alternative
orders can often offer better SIMD hardware utilization.

Two different orders are added: pair-wise (operating on pairs of
adjacent elements) or bisection-wise (operating element-wise on the
first and last N/2 elements of the vector).
---
 base/intrinsics/intrinsics.odin |   4 +
 core/simd/simd.odin             | 194 +++++++++++++++++++++++++++++++-
 src/check_builtin.cpp           |   4 +
 src/checker_builtin_procs.hpp   |   8 ++
 src/llvm_backend_proc.cpp       |  66 +++++++++++
 5 files changed, 274 insertions(+), 2 deletions(-)

diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin
index c275dedaf..9429ec023 100644
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
 
+simd_reduce_add_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_add_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index 0e69304c3..a97155f58 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -1759,7 +1759,103 @@ Returns:
 replace :: intrinsics.simd_replace
 
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and adding the two halves element-wise
+to produce N/2 values. This is repeated until only a single element remains.
+This order may be faster to compute than the ordered sum for floats, as it can
+often be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [+]<-- | ---'      |
+	        |    [+]<--------'
+	        |     |
+	        `>[+]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
+
+/*
+Reduce a vector to a scalar by multiplying up all the lanes in a bisecting fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+bisecting the vector into two parts, where the first contains indices [0, N/2)
+and the second contains indices [N/2, N), and multiplying the two halves
+together element-wise to produce N/2 values. This is repeated until only a
+single element remains. This order may be faster to compute than the ordered
+product for floats, as it can often be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [x]<-- | ---'      |
+	        |    [x]<--------'
+	        |     |
+	        `>[x]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
@@ -1782,7 +1878,7 @@ Result:
 reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
 
 /*
-Reduce a vector to a scalar by multiplying all the lanes.
+Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
 
 This procedure returns a scalar that is the ordered product of all lanes.
 The ordered product may be important for accounting for precision errors in
@@ -1804,6 +1900,100 @@ Result:
 */
 reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
 
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order is supported by hardware instructions for some types/architectures (e.g.
+i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[+]<'     `>[+]<'
+	         |           |
+	         `--->[+]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and multiplying the two halves together
+multiplying each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order may be faster to compute than the ordered product for floats, as it can
+often be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[x]<'     `>[x]<'
+	         |           |
+	         `--->[x]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
+
 /*
 Reduce a vector to a scalar by finding the minimum value between all of the lanes.
 
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index a315d1880..84e30d5b4 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 		}
 		break;
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 		{
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index d8ac10b11..4c4387923 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -170,8 +170,12 @@ BuiltinProc__simd_begin,
 	BuiltinProc_simd_extract,
 	BuiltinProc_simd_replace,
 
+	BuiltinProc_simd_reduce_add_bisect,
+	BuiltinProc_simd_reduce_mul_bisect,
 	BuiltinProc_simd_reduce_add_ordered,
 	BuiltinProc_simd_reduce_mul_ordered,
+	BuiltinProc_simd_reduce_add_pairs,
+	BuiltinProc_simd_reduce_mul_pairs,
 	BuiltinProc_simd_reduce_min,
 	BuiltinProc_simd_reduce_max,
 	BuiltinProc_simd_reduce_and,
@@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
+	{STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_min"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_max"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_and"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 7bd8dea59..14157455e 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 		res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
 		return res;
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)i).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
 		{
@@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 			res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
 			return res;
 		}
+
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems/2; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value;
+				indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 	case BuiltinProc_simd_reduce_and: