From b2cf0755f2aa899a5a87bb92e87d0c46d01f593b Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Wed, 1 Sep 2021 13:08:26 +0100
Subject: [PATCH 01/33] Add `vendor` to nightly.yml

---
 .github/workflows/nightly.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 3d58a8fd8..9ddb8ee83 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -28,6 +28,7 @@ jobs:
           cp LLVM-C.dll dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r bin dist
           cp -r examples dist
       - name: Upload artifact
@@ -51,6 +52,7 @@ jobs:
           cp odin dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r examples dist
       - name: Upload artifact
         uses: actions/upload-artifact@v1
@@ -77,6 +79,7 @@ jobs:
           cp odin dist
           cp -r shared dist
           cp -r core dist
+          cp -r vendor dist
           cp -r examples dist
       - name: Upload artifact
         uses: actions/upload-artifact@v1

From 5e520f4e08feb1324757b123b047b57ac38aadc6 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Mon, 30 Aug 2021 23:00:49 +0200
Subject: [PATCH 02/33] big: Add `reduce_2k`.

---
 core/math/big/build.bat    |   4 +-
 core/math/big/example.odin |   4 +-
 core/math/big/prime.odin   | 212 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 215 insertions(+), 5 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 31480bcc8..e6049b2e1 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,9 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4542e9e15..e2ed30680 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -203,8 +203,8 @@ int_to_byte_little :: proc(v: ^Int) {
 }
 
 demo :: proc() {
-	a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	defer destroy(a, b, c, d, e, f);
+	// a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	// defer destroy(a, b, c, d, e, f);
 }
 
 main :: proc() {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f35e02807..6a5bec6dc 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -15,7 +15,7 @@ package math_big
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
 */
-int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
+internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator) -> (res: bool, err: Error) {
 	assert_if_nil(a);
 	context.allocator = allocator;
 
@@ -207,6 +207,216 @@ int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: D
 	return #force_inline internal_int_montgomery_setup(n);
 }
 
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+	um := m.used;
+
+	/*
+		q = x
+	*/
+	copy(q, x)                                                       or_return;
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1);
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		mul(q, q, mu)                                                or_return;
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return;
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1);
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return;
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return;
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_cmp(x, 0) == -1 {
+		internal_set(q, 1)                                           or_return;
+		internal_shl_digit(q, um + 1)                                or_return;
+		internal_add(x, x, q)                                        or_return;
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_cmp(x, m) != -1 {
+		internal_sub(x, x, m)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+internal_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return;
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+internal_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return;
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a);
+		iw := 1;
+		iz := DIGIT(1);
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil;
+			}
+
+			iz <<= 1;
+			if iz > _DIGIT_MAX {
+				iw += 1;
+				iz  = 1;
+			}
+		}
+		return true, nil;
+	} else {
+		return true, nil;
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_int_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0;
+		iy := 0;
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1;
+			}
+		}
+		return iy >= (a.used / 2), nil;
+	} else {
+		return false, nil;
+	}
+}
+
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From c3a70ac277494b70e86578f1ce31923a0ca8d2c8 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 12:15:09 +0200
Subject: [PATCH 03/33] Big: Added Barrett reduction setup.

---
 core/math/big/prime.odin | 45 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 6a5bec6dc..6a3a098a4 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -416,6 +416,51 @@ internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
 	}
 }
 
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(tmp, tmp, a)                                        or_return;
+
+	return tmp.digit[0], nil;
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+internal_int_reduce_2k_setup_l :: proc(a, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(d, tmp, a)                                          or_return;
+
+	return nil;
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "b" it calulates the value required in "a"
+*/
+internal_int_reduce_setup :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	internal_int_power_of_two(a, b.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(a, a, b);
+}
+
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.

From 65a15e9c060d74bc3a7977c8c3329ec43dc810b2 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 16:43:07 +0200
Subject: [PATCH 04/33] big: Add `internal_int_exponent_mod`.

---
 core/math/big/api.odin      |   7 +-
 core/math/big/common.odin   |  28 ++--
 core/math/big/example.odin  |  24 +++-
 core/math/big/helpers.odin  |   3 +-
 core/math/big/internal.odin |   3 +-
 core/math/big/logical.odin  |   3 +-
 core/math/big/prime.odin    | 248 ++++++++++++++++++++++++++++++++++--
 core/math/big/private.odin  |   3 +-
 core/math/big/public.odin   |   3 +-
 core/math/big/radix.odin    |   3 +-
 core/math/big/test.odin     |   3 +-
 core/math/big/test.py       |   9 ++
 core/math/big/tune.odin     |   3 +-
 13 files changed, 294 insertions(+), 46 deletions(-)

diff --git a/core/math/big/api.odin b/core/math/big/api.odin
index 1f2eab8d7..e2761b425 100644
--- a/core/math/big/api.odin
+++ b/core/math/big/api.odin
@@ -1,14 +1,15 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-2 license.
+	Made available under Odin's BSD-3 license.
 
 	An arbitrary precision mathematics implementation in Odin.
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 
 	This file collects public proc maps and their aliases.
+*/
+package math_big
+/*
 
 	=== === === === === === === === === === === === === === === === === === === === === === === ===
 	                                    Basic arithmetic.
diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index ce1f7d77f..4171d25f3 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 
@@ -57,10 +56,10 @@ when #config(MATH_BIG_EXE, true) {
 	debugged where necessary.
 */
 
-_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MUL_KARATSUBA_CUTOFF,  80);
-_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(SQR_KARATSUBA_CUTOFF, 120);
-_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MUL_TOOM_CUTOFF,      350);
-_DEFAULT_SQR_TOOM_CUTOFF      :: #config(SQR_TOOM_CUTOFF,      400);
+_DEFAULT_MUL_KARATSUBA_CUTOFF :: #config(MATH_BIG_MUL_KARATSUBA_CUTOFF,  80);
+_DEFAULT_SQR_KARATSUBA_CUTOFF :: #config(MATH_BIG_SQR_KARATSUBA_CUTOFF, 120);
+_DEFAULT_MUL_TOOM_CUTOFF      :: #config(MATH_BIG_MUL_TOOM_CUTOFF,      350);
+_DEFAULT_SQR_TOOM_CUTOFF      :: #config(MATH_BIG_SQR_TOOM_CUTOFF,      400);
 
 
 MAX_ITERATIONS_ROOT_N := 500;
@@ -85,15 +84,22 @@ FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
 
 	2) Optimizations thanks to precomputed masks wouldn't work.
 */
-MATH_BIG_FORCE_64_BIT :: #config(MATH_BIG_FORCE_64_BIT, false);
-MATH_BIG_FORCE_32_BIT :: #config(MATH_BIG_FORCE_32_BIT, false);
+MATH_BIG_FORCE_64_BIT   :: #config(MATH_BIG_FORCE_64_BIT, false);
+MATH_BIG_FORCE_32_BIT   :: #config(MATH_BIG_FORCE_32_BIT, false);
 when (MATH_BIG_FORCE_32_BIT && MATH_BIG_FORCE_64_BIT) { #panic("Cannot force 32-bit and 64-bit big backend simultaneously."); };
 
-_LOW_MEMORY           :: #config(BIGINT_SMALL_MEMORY, false);
+/*
+	Trade a smaller memory footprint for more processing overhead?
+*/
+_LOW_MEMORY             :: #config(MATH_BIG_SMALL_MEMORY, false);
 when _LOW_MEMORY {
-	_DEFAULT_DIGIT_COUNT :: 8;
+	_DEFAULT_DIGIT_COUNT ::   8;
+	_TAB_SIZE            ::  32;
+	_MAX_WIN_SIZE        ::   5;
 } else {
-	_DEFAULT_DIGIT_COUNT :: 32;
+	_DEFAULT_DIGIT_COUNT ::  32;
+	_TAB_SIZE            :: 256;
+	_MAX_WIN_SIZE        ::   0;
 }
 
 /*
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index e2ed30680..18b6062d9 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,8 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
+
 
 import "core:fmt"
 import "core:mem"
@@ -18,11 +18,14 @@ print_configation :: proc() {
 `
 Configuration:
 	_DIGIT_BITS                           %v
+	_SMALL_MEMORY                         %v
 	_MIN_DIGIT_COUNT                      %v
 	_MAX_DIGIT_COUNT                      %v
 	_DEFAULT_DIGIT_COUNT                  %v
 	_MAX_COMBA                            %v
 	_WARRAY                               %v
+	_TAB_SIZE                             %v
+	_MAX_WIN_SIZE                         %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -34,11 +37,14 @@ Runtime tunable:
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
 
 `, _DIGIT_BITS,
+_LOW_MEMORY,
 _MIN_DIGIT_COUNT,
 _MAX_DIGIT_COUNT,
 _DEFAULT_DIGIT_COUNT,
 _MAX_COMBA,
 _WARRAY,
+_TAB_SIZE,
+_MAX_WIN_SIZE,
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
 MUL_TOOM_CUTOFF,
@@ -203,8 +209,18 @@ int_to_byte_little :: proc(v: ^Int) {
 }
 
 demo :: proc() {
-	// a, b, c, d, e, f := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
-	// defer destroy(a, b, c, d, e, f);
+	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer destroy(a, b, c, d, e, f, res);
+
+	set(a, 42);
+	set(b, 6);
+	set(c, 5);
+
+	if err := internal_int_exponent_mod(res, a, b, c, 0); err != nil {
+		fmt.printf("Error: %v\n", err);
+	}
+
+	print("res: ", res);
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index 8ce1b2811..ff654172c 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -8,6 +6,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:intrinsics"
 import rnd "core:math/rand"
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 9422067ae..789163af2 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -31,6 +29,7 @@ package math_big
 
 	TODO: Handle +/- Infinity and NaN.
 */
+package math_big
 
 import "core:mem"
 import "core:intrinsics"
diff --git a/core/math/big/logical.odin b/core/math/big/logical.odin
index 64f3b0898..1e7f8e1b1 100644
--- a/core/math/big/logical.odin
+++ b/core/math/big/logical.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains logical operations like `and`, `or` and `xor`.
 */
+package math_big
 
 /*
 	The `and`, `or` and `xor` binops differ in two lines only.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 6a3a098a4..1947ac634 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains prime finding operations.
 */
+package math_big
 
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
@@ -223,7 +222,7 @@ internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		q = x
 	*/
-	copy(q, x)                                                       or_return;
+	internal_copy(q, x)                                              or_return;
 
 	/*
 		q1 = x / b**(k-1)
@@ -234,7 +233,7 @@ internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 		According to HAC this optimization is ok.
 	*/
 	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
-		mul(q, q, mu)                                                or_return;
+		internal_mul(q, q, mu)                                       or_return;
 	} else {
 		_private_int_mul_high(q, q, mu, um)                          or_return;
 	}
@@ -435,32 +434,257 @@ internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) ->
 
 /*
 	Determines the setup value.
-	Assumes `a` is not `nil`.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
 */
-internal_int_reduce_2k_setup_l :: proc(a, d: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
 	tmp := &Int{};
 	defer internal_destroy(tmp);
 	internal_zero(tmp)                                               or_return;
 
-	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
-	internal_sub(d, tmp, a)                                          or_return;
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
+	internal_sub(mu, tmp, P)                                         or_return;
 
 	return nil;
 }
 
 /*
 	Pre-calculate the value required for Barrett reduction.
-	For a given modulus "b" it calulates the value required in "a"
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
 */
-internal_int_reduce_setup :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+internal_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-	internal_int_power_of_two(a, b.used * 2 * _DIGIT_BITS)           or_return;
-	return internal_int_div(a, a, b);
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(mu, mu, P);
 }
 
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return;
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{};
+	defer internal_destroy(mu);
+	internal_zero(mu)                                                or_return;
+
+	if redmode == 0 {
+		internal_int_reduce_setup(mu, P)                             or_return;
+		redux = internal_int_reduce;
+	} else {
+		internal_int_reduce_2k_setup_l(mu, P)                        or_return;
+		redux = internal_int_reduce_2k_l;
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return;
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return;
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, mu)                                          or_return;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return;
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := uint(0);
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue;
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, mu)                                        or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+		}
+	}
+	return err;
+}
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d71946ce2..7e839337f 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -17,6 +15,7 @@ package math_big
 
 	These aren't exported for the same reasons.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index 542725289..d69b3ba22 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -10,6 +8,7 @@ package math_big
 
 	This file contains basic arithmetic operations like `add`, `sub`, `mul`, `div`, ...
 */
+package math_big
 
 /*
 	===========================
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index acf0bacbd..8a7040158 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -1,5 +1,3 @@
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -14,6 +12,7 @@ package math_big
 		- Use Barrett reduction for non-powers-of-two.
 		- Also look at extracting and splatting several digits at once.
 */
+package math_big
 
 import "core:intrinsics"
 import "core:mem"
diff --git a/core/math/big/test.odin b/core/math/big/test.odin
index ea3c6be49..8d60fc5ee 100644
--- a/core/math/big/test.odin
+++ b/core/math/big/test.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -11,6 +9,7 @@ package math_big
 
 	This file exports procedures for use with the test.py test suite.
 */
+package math_big
 
 /*
 	TODO: Write tests for `internal_*` and test reusing parameters with the public implementations.
diff --git a/core/math/big/test.py b/core/math/big/test.py
index df59fa1c8..e095b061e 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -1,3 +1,12 @@
+#
+#	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+#	Made available under Odin's BSD-3 license.
+#
+#	A BigInt implementation in Odin.
+#	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
+#	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
+#
+
 from ctypes import *
 from random import *
 import math
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index 700a5e74a..3381065bb 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -1,6 +1,4 @@
 //+ignore
-package math_big
-
 /*
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
@@ -9,6 +7,7 @@ package math_big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
 */
+package math_big
 
 import "core:fmt"
 import "core:time"

From 2110778040787234041ee977ec456e5098f2caf7 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 21:26:19 +0200
Subject: [PATCH 05/33] big: Add `internal_int_exponent_mod_fast`.

---
 core/math/big/example.odin  |   6 +-
 core/math/big/internal.odin |  25 ++--
 core/math/big/prime.odin    | 283 +++++++++++++++++++++++++++++++++++-
 core/math/big/private.odin  |   5 +-
 4 files changed, 302 insertions(+), 17 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 18b6062d9..4a64aaa01 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -208,15 +208,17 @@ int_to_byte_little :: proc(v: ^Int) {
 	}
 }
 
+// printf :: fmt.printf;
+
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
 	set(a, 42);
 	set(b, 6);
-	set(c, 5);
+	set(c, 131);
 
-	if err := internal_int_exponent_mod(res, a, b, c, 0); err != nil {
+	if err := internal_int_exponent_mod_fast(res, a, b, c, 0); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 789163af2..ca113275c 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -991,13 +991,21 @@ internal_int_mod_bits :: proc(remainder, numerator: ^Int, bits: int, allocator :
 	public ones that have already satisfied these constraints.
 */
 
+/*
+	This procedure returns the allocated capacity of an Int.
+	Assumes `a` not to be `nil`.
+*/
+internal_int_allocated_cap :: #force_inline proc(a: ^Int) -> (cap: int) {
+	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
+	return raw.cap;
+}
+
 /*
 	This procedure will return `true` if the `Int` is initialized, `false` if not.
 	Assumes `a` not to be `nil`.
 */
 internal_int_is_initialized :: #force_inline proc(a: ^Int) -> (initialized: bool) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-	return raw.cap >= _MIN_DIGIT_COUNT;
+	return internal_int_allocated_cap(a) >= _MIN_DIGIT_COUNT;
 }
 internal_is_initialized :: proc { internal_int_is_initialized, };
 
@@ -1650,8 +1658,7 @@ internal_int_destroy :: proc(integers: ..^Int) {
 	integers := integers;
 
 	for a in &integers {
-		raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-		if raw.cap > 0 {
+		if internal_int_allocated_cap(a) > 0 {
 			mem.zero_slice(a.digit[:]);
 			free(&a.digit[0]);
 		}
@@ -1913,23 +1920,23 @@ internal_int_shrink :: proc(a: ^Int) -> (err: Error) {
 internal_shrink :: proc { internal_int_shrink, };
 
 internal_int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := context.allocator) -> (err: Error) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-
 	/*
 		We need at least _MIN_DIGIT_COUNT or a.used digits, whichever is bigger.
 		The caller is asking for `digits`. Let's be accomodating.
 	*/
+	cap := internal_int_allocated_cap(a);
+
 	needed := max(_MIN_DIGIT_COUNT, a.used, digits);
 	if !allow_shrink {
-		needed = max(needed, raw.cap);
+		needed = max(needed, cap);
 	}
 
 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
 	*/
-	if raw.cap == 0 {
+	if cap == 0 {
 		a.digit = make([dynamic]DIGIT, needed, allocator);
-	} else if raw.cap != needed {
+	} else if cap != needed {
 		/*
 			`[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 		*/
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 1947ac634..9771ef077 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -144,7 +144,7 @@ internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := cont
 		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
 		internal_int_power_of_two(a, power)                          or_return;
 	} else {
-		internal_one(a);
+		internal_one(a)                                              or_return;
 		bits = 1;
 	}
 
@@ -187,7 +187,8 @@ internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
 	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-	when _WORD_TYPE_BITS == 64 {
+
+	when _DIGIT_TYPE_BITS == 64 {
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
 	}
@@ -473,6 +474,10 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	M := [_TAB_SIZE]Int{};
 	winsize: uint;
 
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
 	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
 
 	defer {
@@ -686,6 +691,280 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	return err;
 }
 
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = internal_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = internal_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+		if (MP_HAS(MP_DR_SETUP) && MP_HAS(MP_DR_REDUCE)) {
+			/* setup DR reduction for moduli of the form B**k - b */
+			mp_dr_setup(P, &mp);
+			redux = mp_dr_reduce;
+		} else {
+			err = MP_VAL;
+			goto LBL_M;
+		}
+		*/
+		return .Unimplemented;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = internal_int_reduce_2k_setup(P)                        or_return;
+		redux = internal_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		internal_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+   		print("slot: ", &M[slot]);
+		redux(&M[slot], P, rho)                                      or_return;
+		print("slot redux: ", &M[slot]);
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
+	return nil;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 7e839337f..552b100cf 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1730,9 +1730,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return;
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1753,7 +1750,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return;
+	internal_grow(x, n.used + 1)                                     or_return;
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]

From ee04fb1ce1266ac82ee591e8ba0f9339f1038486 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 22:00:20 +0200
Subject: [PATCH 06/33] big: Remove temporary prints.

---
 core/math/big/prime.odin | 2 --
 1 file changed, 2 deletions(-)

diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 9771ef077..12d37269a 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -832,9 +832,7 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 
 	for x = 0; x < int(winsize - 1); x += 1 {
 		internal_sqr(&M[slot], &M[slot])                             or_return;
-   		print("slot: ", &M[slot]);
 		redux(&M[slot], P, rho)                                      or_return;
-		print("slot redux: ", &M[slot]);
 	}
 
 	/*

From 7d0dedf951b615835cd84819460f3f29a89a5c9b Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 23:13:36 +0200
Subject: [PATCH 07/33] big: Add Diminished Radix reduction.

---
 core/math/big/example.odin |   2 +-
 core/math/big/prime.odin   | 114 ++++++++++++++++++++++++++++++++++---
 2 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4a64aaa01..ea4c83328 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := internal_int_exponent_mod_fast(res, a, b, c, 0); err != nil {
+	if err := internal_int_exponent_mod_fast(res, a, b, c, 1); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 12d37269a..027e13462 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,6 +10,8 @@
 */
 package math_big
 
+import "core:mem"
+
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
@@ -781,16 +783,10 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 		}
 	} else if redmode == 1 {
 		/*
-		if (MP_HAS(MP_DR_SETUP) && MP_HAS(MP_DR_REDUCE)) {
-			/* setup DR reduction for moduli of the form B**k - b */
-			mp_dr_setup(P, &mp);
-			redux = mp_dr_reduce;
-		} else {
-			err = MP_VAL;
-			goto LBL_M;
-		}
+			Setup DR reduction for moduli of the form B**k - b.
 		*/
-		return .Unimplemented;
+		rho = internal_int_dr_setup(P);
+		redux = internal_int_dr_reduce;
 	} else {
 		/*
 			Setup DR reduction for moduli of the form 2**k - b.
@@ -963,6 +959,106 @@ internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocat
 	return nil;
 }
 
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+internal_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+internal_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false; }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+internal_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used;
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return;
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int;
+		mu := DIGIT(0);
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
+			x.digit[i] = DIGIT(r & _WORD(_MASK));
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu;
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return;
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_cmp_mag(x, n) == -1 { break; }
+
+		internal_sub(x, x, n)                                        or_return;
+	}
+	return nil;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From a056e1943435ed37330bc6d5c0eac241d323170b Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 00:04:55 +0200
Subject: [PATCH 08/33] big: Cue up `internal_int_exponent_mod` wrapper
 function.

---
 core/math/big/example.odin |    2 +-
 core/math/big/prime.odin   | 1061 ++----------------------------------
 core/math/big/private.odin | 1009 ++++++++++++++++++++++++++++++++++
 3 files changed, 1067 insertions(+), 1005 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index ea4c83328..de0aed468 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := internal_int_exponent_mod_fast(res, a, b, c, 1); err != nil {
+	if err := _private_int_exponent_mod(res, a, b, c, 1); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 027e13462..edd6d9f3d 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,8 +10,6 @@
 */
 package math_big
 
-import "core:mem"
-
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
@@ -35,1027 +33,82 @@ internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator)
 }
 
 /*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
-*/
-internal_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-	/*
-		Can the fast reduction [comba] method be used?
-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
-		since carries are fixed up in the inner loop.
-	*/
-	digs := (n.used * 2) + 1;
-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
-		return _private_montgomery_reduce_comba(x, n, rho);
-	}
+	This is a shell function that calls either the normal or Montgomery exptmod functions.
+	Originally the call to the Montgomery code was embedded in the normal function but that
+	wasted alot of stack space for nothing (since 99% of the time the Montgomery code would be called).
 
-	/*
-		Grow the input as required
-	*/
-	internal_grow(x, digs)                                           or_return;
-	x.used = digs;
-
-	for ix := 0; ix < n.used; ix += 1 {
-		/*
-			`mu = ai * rho mod b`
-			The value of rho must be precalculated via `int_montgomery_setup()`,
-			such that it equals -1/n0 mod b this allows the following inner loop
-			to reduce the input one digit at a time.
-		*/
-
-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
-
-		/*
-			a = a + mu * m * b**i
-			Multiply and add in place.
-		*/
-		u  := DIGIT(0);
-		iy := int(0);
-		for ; iy < n.used; iy += 1 {
-			/*
-				Compute product and sum.
-			*/
-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
-
-			/*
-				Get carry.
-			*/
-			u = DIGIT(r >> _DIGIT_BITS);
-
-			/*
-				Fix digit.
-			*/
-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
-		}
-
-		/*
-			At this point the ix'th digit of x should be zero.
-			Propagate carries upwards as required.
-		*/
-		for u != 0 {
-			x.digit[ix + iy] += u;
-			u = x.digit[ix + iy] >> _DIGIT_BITS;
-			x.digit[ix + iy] &= _MASK;
-			iy += 1;
-		}
-	}
-
-	/*
-		At this point the n.used'th least significant digits of x are all zero,
-		which means we can shift x to the right by n.used digits and the
-		residue is unchanged.
-
-		x = x/b**n.used.
-	*/
-	internal_clamp(x);
-	internal_shr_digit(x, n.used);
-
-	/*
-		if x >= n then x = x - n
-	*/
-	if internal_cmp_mag(x, n) != -1 {
-		return internal_sub(x, x, n);
-	}
-
-	return nil;
-}
-
-int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(x, n);
-	context.allocator = allocator;
-
-	internal_clear_if_uninitialized(x, n) or_return;
-
-	return #force_inline internal_int_montgomery_reduce(x, n, rho);
-}
-
-/*
-	Shifts with subtractions when the result is greater than b.
-
-	The method is slightly modified to shift B unconditionally upto just under
-	the leading bit of b.  This saves alot of multiple precision shifting.
-*/
-internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-	/*
-		How many bits of last digit does b use.
-	*/
-	bits := internal_count_bits(b) % _DIGIT_BITS;
-
-	if b.used > 1 {
-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
-		internal_int_power_of_two(a, power)                          or_return;
-	} else {
-		internal_one(a)                                              or_return;
-		bits = 1;
-	}
-
-	/*
-		Now compute C = A * B mod b.
-	*/
-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
-		internal_int_shl1(a, a)                                      or_return;
-		if internal_cmp_mag(a, b) != -1 {
-			internal_sub(a, a, b)                                    or_return;
-		}
-	}
-	return nil;
-}
-
-int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	assert_if_nil(a, b);
-	context.allocator = allocator;
-
-	internal_clear_if_uninitialized(a, b) or_return;
-
-	return #force_inline internal_int_montgomery_calc_normalization(a, b);
-}
-
-/*
-	Sets up the Montgomery reduction stuff.
-*/
-internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
-	/*
-		Fast inversion mod 2**k
-		Based on the fact that:
-
-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
-		                  =>  2*X*A - X*X*A*A = 1
-		                  =>  2*(1) - (1)     = 1
-	*/
-	b := n.digit[0];
-	if b & 1 == 0 { return 0, .Invalid_Argument; }
-
-	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
-	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-
-	when _DIGIT_TYPE_BITS == 64 {
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
-		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
-	}
-
-	/*
-		rho = -1/m mod b
-	*/
-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
-	return rho, nil;
-}
-
-int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
-	assert_if_nil(n);
-	internal_clear_if_uninitialized(n, allocator) or_return;
-
-	return #force_inline internal_int_montgomery_setup(n);
-}
-
-/*
-	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
-	From HAC pp.604 Algorithm 14.42
-
-	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
-*/
-internal_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-	um := m.used;
-
-	/*
-		q = x
-	*/
-	internal_copy(q, x)                                              or_return;
-
-	/*
-		q1 = x / b**(k-1)
-	*/
-	internal_shr_digit(q, um - 1);
-
-	/*
-		According to HAC this optimization is ok.
-	*/
-	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
-		internal_mul(q, q, mu)                                       or_return;
-	} else {
-		_private_int_mul_high(q, q, mu, um)                          or_return;
-	}
-
-	/*
-		q3 = q2 / b**(k+1)
-	*/
-	internal_shr_digit(q, um + 1);
-
-	/*
-		x = x mod b**(k+1), quick (no division)
-	*/
-	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
-
-	/*
-		q = q * m mod b**(k+1), quick (no division)
-	*/
-	_private_int_mul(q, q, m, um + 1)                                or_return;
-
-	/*
-		x = x - q
-	*/
-	internal_sub(x, x, q)                                            or_return;
-
-	/*
-		If x < 0, add b**(k+1) to it.
-	*/
-	if internal_cmp(x, 0) == -1 {
-		internal_set(q, 1)                                           or_return;
-		internal_shl_digit(q, um + 1)                                or_return;
-		internal_add(x, x, q)                                        or_return;
-	}
-
-	/*
-		Back off if it's too big.
-	*/
-	for internal_cmp(x, m) != -1 {
-		internal_sub(x, x, m)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
-*/
-internal_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-
-	internal_zero(q)                                                 or_return;
-
-	p := internal_count_bits(n);
-
-	for {
-		/*
-			q = a/2**p, a = a mod 2**p
-		*/
-		internal_shrmod(q, a, a, p)                                  or_return;
-
-		if d != 1 {
-			/*
-				q = q * d
-			*/
-			internal_mul(q, q, d)                                    or_return;
-		}
-
-		/*
-			a = a + q
-		*/
-		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
-		internal_sub(a, a, n)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Reduces `a` modulo `n` where `n` is of the form 2**p - d
-	This differs from reduce_2k since "d" can be larger than a single digit.
-*/
-internal_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	q := &Int{};
-	defer internal_destroy(q);
-
-	internal_zero(q)                                                 or_return;
-
-	p := internal_count_bits(n);
-
-	for {
-		/*
-			q = a/2**p, a = a mod 2**p
-		*/
-		internal_shrmod(q, a, a, p)                                  or_return;
-
-		/*
-			q = q * d
-		*/
-		internal_mul(q, q, d)                                        or_return;
-
-		/*
-			a = a + q
-		*/
-		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
-		internal_sub(a, a, n)                                        or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Determines if `internal_int_reduce_2k` can be used.
-	Asssumes `a` not to be `nil` and to have been initialized.
-*/
-internal_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a);
-
-	if internal_is_zero(a) {
-		return false, nil;
-	} else if a.used == 1 {
-		return true, nil;
-	} else if a.used  > 1 {
-		iy := internal_count_bits(a);
-		iw := 1;
-		iz := DIGIT(1);
-
-		/*
-			Test every bit from the second digit up, must be 1.
-		*/
-		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
-			if a.digit[iw] & iz == 0 {
-				return false, nil;
-			}
-
-			iz <<= 1;
-			if iz > _DIGIT_MAX {
-				iw += 1;
-				iz  = 1;
-			}
-		}
-		return true, nil;
-	} else {
-		return true, nil;
-	}
-}
-
-/*
-	Determines if `internal_int_reduce_2k_l` can be used.
-	Asssumes `a` not to be `nil` and to have been initialized.
-*/
-internal_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a);
-
-	if internal_int_is_zero(a) {
-		return false, nil;
-	} else if a.used == 1 {
-		return true, nil;
-	} else if a.used  > 1 {
-		/*
-			If more than half of the digits are -1 we're sold.
-		*/
-		ix := 0;
-		iy := 0;
-
-		for ; ix < a.used; ix += 1 {
-			if a.digit[ix] == _DIGIT_MAX {
-				iy += 1;
-			}
-		}
-		return iy >= (a.used / 2), nil;
-	} else {
-		return false, nil;
-	}
-}
-
-/*
-	Determines the setup value.
-	Assumes `a` is not `nil`.
-*/
-internal_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
-	context.allocator = allocator;
-
-	tmp := &Int{};
-	defer internal_destroy(tmp);
-	internal_zero(tmp)                                               or_return;
-
-	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
-	internal_sub(tmp, tmp, a)                                        or_return;
-
-	return tmp.digit[0], nil;
-}
-
-/*
-	Determines the setup value.
-	Assumes `mu` and `P` are not `nil`.
-
-	d := (1 << a.bits) - a;
-*/
-internal_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	tmp := &Int{};
-	defer internal_destroy(tmp);
-	internal_zero(tmp)                                               or_return;
-
-	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
-	internal_sub(mu, tmp, P)                                         or_return;
-
-	return nil;
-}
-
-/*
-	Pre-calculate the value required for Barrett reduction.
-	For a given modulus "P" it calulates the value required in "mu"
-	Assumes `mu` and `P` are not `nil`.
-*/
-internal_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
-	return internal_int_div(mu, mu, P);
-}
-
-/*
 	Computes res == G**X mod P.
 	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
 */
-internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-	M := [_TAB_SIZE]Int{};
-	winsize: uint;
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
-
-	defer {
-		internal_destroy(&M[1]);
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x]);
-		}
-	}
-
-	/*
-		Find window size.
-	*/
-	x := internal_count_bits(X);
-	switch {
-	case x <= 7:
-		winsize = 2;
-	case x <= 36:
-		winsize = 3;
-	case x <= 140:
-		winsize = 4;
-	case x <= 450:
-		winsize = 5;
-	case x <= 1303:
-		winsize = 6;
-	case x <= 3529:
-		winsize = 7;
-	case:
-		winsize = 8;
-	}
-
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
-
-	/*
-		Init M array.
-		Init first cell.
-	*/
-	internal_zero(&M[1])                                             or_return;
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_zero(&M[x])                                         or_return;
-	}
-
-	/*
-		Create `mu`, used for Barrett reduction.
-	*/
-	mu := &Int{};
-	defer internal_destroy(mu);
-	internal_zero(mu)                                                or_return;
-
-	if redmode == 0 {
-		internal_int_reduce_setup(mu, P)                             or_return;
-		redux = internal_int_reduce;
-	} else {
-		internal_int_reduce_2k_setup_l(mu, P)                        or_return;
-		redux = internal_int_reduce_2k_l;
-	}
-
-	/*
-		Create M table.
-
-		The M table contains powers of the base, e.g. M[x] = G**x mod P.
-		The first half of the table is not computed, though, except for M[0] and M[1].
-	*/
-	internal_int_mod(&M[1], G, P)                                    or_return;
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-
-		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
-		instead of repeated squaring.
-	*/
-	slot := 1 << (winsize - 1);
-	internal_copy(&M[slot], &M[1])                                   or_return;
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		/*
-			Square it.
-		*/
-		internal_sqr(&M[slot], &M[slot])                             or_return;
-
-		/*
-			Reduce modulo P
-		*/
-		redux(&M[slot], P, mu)                                       or_return;
-	}
-
-	/*
-		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-	*/
-	for x = slot + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
-		redux(&M[x], P, mu)                                          or_return;
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_one(res)                                                or_return;
-
-	/*
-		Set initial mode and bit cnt.
-	*/
-	mode   := 0;
-	bitcnt := 1;
-	buf    := DIGIT(0);
-	digidx := X.used - 1;
-	bitcpy := uint(0);
-	bitbuf := DIGIT(0);
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1;
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits.
-			*/
-			if digidx == -1 { break; }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx];
-			digidx -= 1;
-			bitcnt = _DIGIT_BITS;
-		}
-
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := buf >> (_DIGIT_BITS - 1) & 1;
-		buf <<= 1;
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit
-			in the exponent.  Technically this opt is not required but it
-			does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 {
-			continue;
-		}
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, mu)                                        or_return;
-			continue;
-		}
-
-		/*
-			Else we add it to the window.
-		*/
-		bitcpy += 1;
-		bitbuf |= (y << (winsize - bitcpy));
-		mode    = 2;
-
-		if (bitcpy == winsize) {
-			/*
-				Window is filled so square as required and multiply.
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return;
-				redux(res, P, mu)                                    or_return;
-			}
-
-			/*
-				Then multiply.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return;
-			redux(res, P, mu)                                        or_return;
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0;
-			bitbuf = 0;
-			mode   = 1;
-		}
-	}
-
-	/*
-		If bits remain then square/multiply.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < int(bitcpy); x += 1 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, mu)                                        or_return;
-
-			bitbuf <<= 1;
-			if ((bitbuf & (1 << winsize)) != 0) {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return;
-				redux(res, P, mu)                                    or_return;
-			}
-		}
-	}
-	return err;
-}
-
 /*
-	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+	int dr;
 
-	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
-	The value of `k` changes based on the size of the exponent.
-
-	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
-
-	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
-*/
-internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator;
-
-	M := [_TAB_SIZE]Int{};
-	winsize: uint;
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
-
-	defer {
-		internal_destroy(&M[1]);
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x]);
-		}
+	/* modulus P must be positive */
+	if (mp_isneg(P)) {
+		return MP_VAL;
 	}
 
-	/*
-		Find window size.
-	*/
-	x := internal_count_bits(X);
-	switch {
-	case x <= 7:
-		winsize = 2;
-	case x <= 36:
-		winsize = 3;
-	case x <= 140:
-		winsize = 4;
-	case x <= 450:
-		winsize = 5;
-	case x <= 1303:
-		winsize = 6;
-	case x <= 3529:
-		winsize = 7;
-	case:
-		winsize = 8;
-	}
+	/* if exponent X is negative we have to recurse */
+	if (mp_isneg(X)) {
+		mp_int tmpG, tmpX;
+		mp_err err;
 
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
-
-	/*
-		Init M array
-		Init first cell.
-	*/
-	cap := internal_int_allocated_cap(P);
-	internal_grow(&M[1], cap)                                        or_return;
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_grow(&M[x], cap)                                    or_return;
-	}
-
-	/*
-		Determine and setup reduction code.
-	*/
-	rho: DIGIT;
-
-	if redmode == 0 {
-		/*
-			Now setup Montgomery.
-		*/
-		rho = internal_int_montgomery_setup(P)                       or_return;
-
-		/*
-			Automatically pick the comba one if available (saves quite a few calls/ifs).
-		*/
-		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
-			redux = _private_montgomery_reduce_comba;
-		} else {
-			/*
-				Use slower baseline Montgomery method.
-			*/
-			redux = internal_int_montgomery_reduce;
-		}
-	} else if redmode == 1 {
-		/*
-			Setup DR reduction for moduli of the form B**k - b.
-		*/
-		rho = internal_int_dr_setup(P);
-		redux = internal_int_dr_reduce;
-	} else {
-		/*
-			Setup DR reduction for moduli of the form 2**k - b.
-		*/
-		rho = internal_int_reduce_2k_setup(P)                        or_return;
-		redux = internal_int_reduce_2k;
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_grow(res, cap)                                          or_return;
-
-	/*
-		Create M table
-		The first half of the table is not computed, though, except for M[0] and M[1]
-	*/
-
-	if redmode == 0 {
-		/*
-			Now we need R mod m.
-		*/
-		internal_int_montgomery_calc_normalization(res, P)           or_return;
-
-		/*
-			Now set M[1] to G * R mod m.
-		*/
-		internal_mulmod(&M[1], G, res, P)                            or_return;
-	} else {
-		internal_one(res)                                            or_return;
-		internal_mod(&M[1], G, P)                                    or_return;
-	}
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-	*/
-	slot := 1 << (winsize - 1);
-	internal_copy(&M[slot], &M[1])                                   or_return;
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		internal_sqr(&M[slot], &M[slot])                             or_return;
-		redux(&M[slot], P, rho)                                      or_return;
-	}
-
-	/*
-		Create upper table.
-	*/
-	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
-		redux(&M[x], P, rho)                                         or_return;
-	}
-
-	/*
-		Set initial mode and bit cnt.
-	*/
-	mode   := 0;
-	bitcnt := 1;
-	buf    := DIGIT(0);
-	digidx := X.used - 1;
-	bitcpy := 0;
-	bitbuf := DIGIT(0);
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1;
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits so break.
-			*/
-			if digidx == -1 { break; }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx];
-			digidx -= 1;
-			bitcnt = _DIGIT_BITS;
+		if (!MP_HAS(MP_INVMOD)) {
+			return MP_VAL;
 		}
 
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := (buf >> (_DIGIT_BITS - 1)) & 1;
-		buf <<= 1;
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit in the exponent.
-			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 { continue; }
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, rho)                                       or_return;
-			continue;
+		if ((err = mp_init_multi(&tmpG, &tmpX, NULL)) != MP_OKAY) {
+			return err;
 		}
 
-		/*
-			Else we add it to the window.
-		*/
-		bitcpy += 1;
-		bitbuf |= (y << (winsize - uint(bitcpy)));
-		mode    = 2;
-
-		if bitcpy == int(winsize) {
-			/*
-				Window is filled so square as required and multiply
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return;
-				redux(res, P, rho)                                   or_return;
-			}
-
-			/*
-				Then multiply.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return;
-			redux(res, P, rho)                                       or_return;
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0;
-			bitbuf = 0;
-			mode   = 1;
-		}
-	}
-
-	/*
-		If bits remain then square/multiply.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < bitcpy; x += 1 {
-			internal_sqr(res, res)                                   or_return;
-			redux(res, P, rho)                                       or_return;
-
-			/*
-				Get next bit of the window.
-			*/
-			bitbuf <<= 1;
-			if bitbuf & (1 << winsize) != 0 {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return;
-				redux(res, P, rho)                                   or_return;
-			}
-		}
-	}
-
-	if redmode == 0 {
-		/*
-			Fixup result if Montgomery reduction is used.
-			Recall that any value in a Montgomery system is actually multiplied by R mod n.
-			So we have to reduce one more time to cancel out the factor of R.
-		*/
-		redux(res, P, rho)                                           or_return;
-	}
-
-	return nil;
-}
-
-/*
-	Determines the setup value.
-	Assumes `a` to not be `nil` and to have been initialized.
-*/
-internal_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
-	/*
-		The casts are required if _DIGIT_BITS is one less than
-		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
-	*/
-	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
-}
-
-/*
-	Determines if a number is a valid DR modulus.
-	Assumes `a` to not be `nil` and to have been initialized.
-*/
-internal_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
-	/*
-		Must be at least two digits.
-	*/
-	if a.used < 2 { return false; }
-
-	/*
-		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
-	*/
-	for ix := 1; ix < a.used; ix += 1 {
-		if a.digit[ix] != _MASK {
-			return false;
-		}
-	}
-	return true;
-}
-
-/*
-	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-	Based on algorithm from the paper
-
-		"Generating Efficient Primes for Discrete Log Cryptosystems"
-					Chae Hoon Lim, Pil Joong Lee,
-			POSTECH Information Research Laboratories
-
-	The modulus must be of a special format [see manual].
-	Has been modified to use algorithm 7.10 from the LTM book instead
-
-	Input x must be in the range 0 <= x <= (n-1)**2
-	Assumes `x` and `n` to not be `nil` and to have been initialized.
-*/
-internal_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
-	/*
-		m = digits in modulus.
-	*/
-	m := n.used;
-
-	/*
-		Ensure that "x" has at least 2m digits.
-	*/
-	internal_grow(x, m + m)                                          or_return;
-
-	/*
-		Top of loop, this is where the code resumes if another reduction pass is required.
-	*/
-	for {
-		i: int;
-		mu := DIGIT(0);
-
-		/*
-			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
-		*/
-		for i = 0; i < m; i += 1 {
-			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
-			x.digit[i] = DIGIT(r & _WORD(_MASK));
-			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		/* first compute 1/G mod P */
+		if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+			goto LBL_ERR;
 		}
 
-		/*
-			Set final carry.
-		*/
-		x.digit[i] = mu;
+		/* now get |X| */
+		if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+			goto LBL_ERR;
+		}
 
-		/*
-			Zero words above m.
-		*/
-		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
-
-		/*
-			Clamp, sub and return.
-		*/
-		internal_clamp(x)                                            or_return;
-
-		/*
-			If x >= n then subtract and reduce again.
-			Each successive "recursion" makes the input smaller and smaller.
-		*/
-		if internal_cmp_mag(x, n) == -1 { break; }
-
-		internal_sub(x, x, n)                                        or_return;
+		/* and now compute (1/G)**|X| instead of G**X [X < 0] */
+		err = mp_exptmod(&tmpG, &tmpX, P, Y);
+LBL_ERR:
+		mp_clear_multi(&tmpG, &tmpX, NULL);
+		return err;
 	}
+
+	/* modified diminished radix reduction */
+	if (MP_HAS(MP_REDUCE_IS_2K_L) && MP_HAS(MP_REDUCE_2K_L) && MP_HAS(S_MP_EXPTMOD) &&
+		 mp_reduce_is_2k_l(P)) {
+		return s_mp_exptmod(G, X, P, Y, 1);
+	}
+
+	/* is it a DR modulus? default to no */
+	dr = (MP_HAS(MP_DR_IS_MODULUS) && mp_dr_is_modulus(P)) ? 1 : 0;
+
+	/* if not, is it a unrestricted DR modulus? */
+	if (MP_HAS(MP_REDUCE_IS_2K) && (dr == 0)) {
+		dr = (mp_reduce_is_2k(P)) ? 2 : 0;
+	}
+
+	/* if the modulus is odd or dr != 0 use the montgomery method */
+	if (MP_HAS(S_MP_EXPTMOD_FAST) && (mp_isodd(P) || (dr != 0))) {
+		return s_mp_exptmod_fast(G, X, P, Y, dr);
+	}
+
+	/* otherwise use the generic Barrett reduction technique */
+	if (MP_HAS(S_MP_EXPTMOD)) {
+		return s_mp_exptmod(G, X, P, Y, 0);
+	}
+
+	/* no exptmod for evens */
+	return MP_VAL;
+
+	*/
 	return nil;
 }
 
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 552b100cf..d2878fcc1 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1850,6 +1850,1015 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	return nil;
 }
 
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return;
+
+	digs := (n.used * 2) + 1;
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho);
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return;
+	x.used = digs;
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK));
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+		*/
+		u  := DIGIT(0);
+		iy := int(0);
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]));
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS);
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK));
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u;
+			u = x.digit[ix + iy] >> _DIGIT_BITS;
+			x.digit[ix + iy] &= _MASK;
+			iy += 1;
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x);
+	internal_shr_digit(x, n.used);
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_cmp_mag(x, n) != -1 {
+		return internal_sub(x, x, n);
+	}
+
+	return nil;
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally upto just under
+	the leading bit of b.  This saves alot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	/*
+		How many bits of last digit does b use.
+	*/
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	bits := internal_count_bits(b) % _DIGIT_BITS;
+
+	if b.used > 1 {
+		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
+		internal_int_power_of_two(a, power)                          or_return;
+	} else {
+		internal_one(a)                                              or_return;
+		bits = 1;
+	}
+
+	/*
+		Now compute C = A * B mod b.
+	*/
+	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
+		internal_int_shl1(a, a)                                      or_return;
+		if internal_cmp_mag(a, b) != -1 {
+			internal_sub(a, a, b)                                    or_return;
+		}
+	}
+	return nil;
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return;
+
+	b := n.digit[0];
+	if b & 1 == 0 { return 0, .Invalid_Argument; }
+
+	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
+		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK));
+	return rho, nil;
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+	um := m.used;
+
+	/*
+		q = x
+	*/
+	internal_copy(q, x)                                              or_return;
+
+	/*
+		q1 = x / b**(k-1)
+	*/
+	internal_shr_digit(q, um - 1);
+
+	/*
+		According to HAC this optimization is ok.
+	*/
+	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
+		internal_mul(q, q, mu)                                       or_return;
+	} else {
+		_private_int_mul_high(q, q, mu, um)                          or_return;
+	}
+
+	/*
+		q3 = q2 / b**(k+1)
+	*/
+	internal_shr_digit(q, um + 1);
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return;
+
+	/*
+		q = q * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(q, q, m, um + 1)                                or_return;
+
+	/*
+		x = x - q
+	*/
+	internal_sub(x, x, q)                                            or_return;
+
+	/*
+		If x < 0, add b**(k+1) to it.
+	*/
+	if internal_cmp(x, 0) == -1 {
+		internal_set(q, 1)                                           or_return;
+		internal_shl_digit(q, um + 1)                                or_return;
+		internal_add(x, x, q)                                        or_return;
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for internal_cmp(x, m) != -1 {
+		internal_sub(x, x, m)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		if d != 1 {
+			/*
+				q = q * d
+			*/
+			internal_mul(q, q, d)                                    or_return;
+		}
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	q := &Int{};
+	defer internal_destroy(q);
+
+	internal_zero(q)                                                 or_return;
+
+	p := internal_count_bits(n);
+
+	for {
+		/*
+			q = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(q, a, a, p)                                  or_return;
+
+		/*
+			q = q * d
+		*/
+		internal_mul(q, q, d)                                        or_return;
+
+		/*
+			a = a + q
+		*/
+		internal_add(a, a, q)                                        or_return;
+		if internal_cmp_mag(a, n) == -1                              { break; }
+		internal_sub(a, a, n)                                        or_return;
+	}
+
+	return nil;
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		iy := internal_count_bits(a);
+		iw := 1;
+		iz := DIGIT(1);
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
+			if a.digit[iw] & iz == 0 {
+				return false, nil;
+			}
+
+			iz <<= 1;
+			if iz > _DIGIT_MAX {
+				iw += 1;
+				iz  = 1;
+			}
+		}
+		return true, nil;
+	} else {
+		return true, nil;
+	}
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Asssumes `a` not to be `nil` and to have been initialized.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a);
+
+	if internal_int_is_zero(a) {
+		return false, nil;
+	} else if a.used == 1 {
+		return true, nil;
+	} else if a.used  > 1 {
+		/*
+			If more than half of the digits are -1 we're sold.
+		*/
+		ix := 0;
+		iy := 0;
+
+		for ; ix < a.used; ix += 1 {
+			if a.digit[ix] == _DIGIT_MAX {
+				iy += 1;
+			}
+		}
+		return iy >= (a.used / 2), nil;
+	} else {
+		return false, nil;
+	}
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(a))           or_return;
+	internal_sub(tmp, tmp, a)                                        or_return;
+
+	return tmp.digit[0], nil;
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	d := (1 << a.bits) - a;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	tmp := &Int{};
+	defer internal_destroy(tmp);
+	internal_zero(tmp)                                               or_return;
+
+	internal_int_power_of_two(tmp, internal_count_bits(P))           or_return;
+	internal_sub(mu, tmp, P)                                         or_return;
+
+	return nil;
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calulates the value required in "mu"
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS)           or_return;
+	return internal_int_div(mu, mu, P);
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The casts are required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	return DIGIT((1 << _DIGIT_BITS) - a.digit[0]);
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	if a.used < 2 { return false; }
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+	*/
+	for ix := 1; ix < a.used; ix += 1 {
+		if a.digit[ix] != _MASK {
+			return false;
+		}
+	}
+	return true;
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used;
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return;
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int;
+		mu := DIGIT(0);
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu);
+			x.digit[i] = DIGIT(r & _WORD(_MASK));
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS));
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu;
+
+		/*
+			Zero words above m.
+		*/
+		mem.zero_slice(x.digit[m + 1:][:x.used - m]);
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return;
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_cmp_mag(x, n) == -1 { break; }
+
+		internal_sub(x, x, n)                                        or_return;
+	}
+	return nil;
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return;
+	}
+
+	/*
+		Create `mu`, used for Barrett reduction.
+	*/
+	mu := &Int{};
+	defer internal_destroy(mu);
+	internal_zero(mu)                                                or_return;
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return;
+		redux = _private_int_reduce;
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return;
+		redux = _private_int_reduce_2k_l;
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return;
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return;
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, mu)                                          or_return;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return;
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := uint(0);
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue;
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, mu)                                        or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, mu)                                        or_return;
+
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, mu)                                    or_return;
+			}
+		}
+	}
+	return err;
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P);
+		redux = _private_int_dr_reduce;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return;
+		redux = _private_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+		redux(&M[slot], P, rho)                                      or_return;
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
+	return nil;
+}
+
 /*
 	hac 14.61, pp608
 */

From 7d7ed6b95f50ec36e6ed5ad6bded466ca75de26e Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 12:33:33 +0200
Subject: [PATCH 09/33] big: Add `internal_int_exponent_mod`.

---
 core/math/big/example.odin |   2 +-
 core/math/big/prime.odin   | 127 ++++++++++++++++++-------------------
 2 files changed, 63 insertions(+), 66 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index de0aed468..64ac30424 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -218,7 +218,7 @@ demo :: proc() {
 	set(b, 6);
 	set(c, 131);
 
-	if err := _private_int_exponent_mod(res, a, b, c, 1); err != nil {
+	if err := internal_int_exponent_mod(res, a, b, c); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index edd6d9f3d..85a7b931f 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -43,73 +43,70 @@ internal_int_prime_is_divisible :: proc(a: ^Int, allocator := context.allocator)
 internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
-/*
-	int dr;
-
-	/* modulus P must be positive */
-	if (mp_isneg(P)) {
-		return MP_VAL;
-	}
-
-	/* if exponent X is negative we have to recurse */
-	if (mp_isneg(X)) {
-		mp_int tmpG, tmpX;
-		mp_err err;
-
-		if (!MP_HAS(MP_INVMOD)) {
-			return MP_VAL;
-		}
-
-		if ((err = mp_init_multi(&tmpG, &tmpX, NULL)) != MP_OKAY) {
-			return err;
-		}
-
-		/* first compute 1/G mod P */
-		if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
-			goto LBL_ERR;
-		}
-
-		/* now get |X| */
-		if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
-			goto LBL_ERR;
-		}
-
-		/* and now compute (1/G)**|X| instead of G**X [X < 0] */
-		err = mp_exptmod(&tmpG, &tmpX, P, Y);
-LBL_ERR:
-		mp_clear_multi(&tmpG, &tmpX, NULL);
-		return err;
-	}
-
-	/* modified diminished radix reduction */
-	if (MP_HAS(MP_REDUCE_IS_2K_L) && MP_HAS(MP_REDUCE_2K_L) && MP_HAS(S_MP_EXPTMOD) &&
-		 mp_reduce_is_2k_l(P)) {
-		return s_mp_exptmod(G, X, P, Y, 1);
-	}
-
-	/* is it a DR modulus? default to no */
-	dr = (MP_HAS(MP_DR_IS_MODULUS) && mp_dr_is_modulus(P)) ? 1 : 0;
-
-	/* if not, is it a unrestricted DR modulus? */
-	if (MP_HAS(MP_REDUCE_IS_2K) && (dr == 0)) {
-		dr = (mp_reduce_is_2k(P)) ? 2 : 0;
-	}
-
-	/* if the modulus is odd or dr != 0 use the montgomery method */
-	if (MP_HAS(S_MP_EXPTMOD_FAST) && (mp_isodd(P) || (dr != 0))) {
-		return s_mp_exptmod_fast(G, X, P, Y, dr);
-	}
-
-	/* otherwise use the generic Barrett reduction technique */
-	if (MP_HAS(S_MP_EXPTMOD)) {
-		return s_mp_exptmod(G, X, P, Y, 0);
-	}
-
-	/* no exptmod for evens */
-	return MP_VAL;
+	dr: int;
 
+	/*
+		Modulus P must be positive.
 	*/
-	return nil;
+	if internal_is_negative(P) { return .Invalid_Argument; }
+
+	/*
+		If exponent X is negative we have to recurse.
+	*/
+	if internal_is_negative(X) {
+		tmpG, tmpX := &Int{}, &Int{};
+		defer internal_destroy(tmpG, tmpX);
+
+		internal_init_multi(tmpG, tmpX) or_return;
+
+		/*
+			First compute 1/G mod P.
+		*/
+		internal_invmod(tmpG, G, P) or_return;
+
+		/*
+			now get |X|.
+		*/
+		internal_abs(tmpX, X) or_return;
+
+		/*
+			And now compute (1/G)**|X| instead of G**X [X < 0].
+		*/
+		return internal_int_exponent_mod(res, tmpG, tmpX, P);
+	}
+
+	/*
+		Modified diminished radix reduction.
+	*/
+	can_reduce_2k_l := _private_int_reduce_is_2k_l(P) or_return;
+	if can_reduce_2k_l {
+		return _private_int_exponent_mod(res, G, X, P, 1);
+	}
+
+	/*
+		Is it a DR modulus? default to no.
+	*/
+	dr = 1 if _private_dr_is_modulus(P) else 0;
+
+	/*
+		If not, is it a unrestricted DR modulus?
+	*/
+	if dr == 0 {
+		reduce_is_2k := _private_int_reduce_is_2k(P) or_return;
+		dr = 2 if reduce_is_2k else 0;
+	}
+
+	/*
+		If the modulus is odd or dr != 0 use the montgomery method.
+	*/
+	if internal_int_is_odd(P) || dr != 0 {
+		return _private_int_exponent_mod(res, G, X, P, dr);
+	}
+
+	/*
+		Otherwise use the generic Barrett reduction technique.
+	*/
+	return _private_int_exponent_mod(res, G, X, P, 0);
 }
 
 /*

From fd83cbf40bad98e7d50d75bee2cdf4a5d3d54921 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 14:36:15 +0200
Subject: [PATCH 10/33] big: Add `ilog2`.

---
 core/math/big/example.odin | 10 ----------
 core/math/big/private.odin |  2 +-
 core/math/big/public.odin  |  6 ++++++
 3 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 64ac30424..449a851d1 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -213,16 +213,6 @@ int_to_byte_little :: proc(v: ^Int) {
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
-
-	set(a, 42);
-	set(b, 6);
-	set(c, 131);
-
-	if err := internal_int_exponent_mod(res, a, b, c); err != nil {
-		fmt.printf("Error: %v\n", err);
-	}
-
-	print("res: ", res);
 }
 
 main :: proc() {
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d2878fcc1..72f5bb9e2 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1437,7 +1437,7 @@ _private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := cont
 	internal_one(inner, false)                                       or_return;
 	internal_one(outer, false)                                       or_return;
 
-	bits_used := int(_DIGIT_TYPE_BITS - intrinsics.count_leading_zeros(n));
+	bits_used := ilog2(n);
 
 	for i := bits_used; i >= 0; i -= 1 {
 		start := (n >> (uint(i) + 1)) + 1 | 1;
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index d69b3ba22..68c574905 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -10,6 +10,8 @@
 */
 package math_big
 
+import "core:intrinsics"
+
 /*
 	===========================
 		User-level routines    
@@ -383,6 +385,10 @@ digit_log :: proc(a: DIGIT, base: DIGIT) -> (log: int, err: Error) {
 }
 log :: proc { int_log, digit_log, };
 
+ilog2 :: proc(value: $T) -> (log2: T) {
+	return (size_of(T) * 8) - intrinsics.count_leading_zeros(value);
+}
+
 /*
 	Calculate `dest = base^power` using a square-multiply algorithm.
 */

From df29d1021058624664ea8ec8fac0d01d9d45b97a Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 15:57:08 +0200
Subject: [PATCH 11/33] big: Add `internal_int_kronecker`.

---
 core/math/big/example.odin | 129 -------------------------------------
 core/math/big/prime.odin   |  95 +++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 129 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 449a851d1..4da2ebbe9 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -79,135 +79,6 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-int_to_byte :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b1);
-	int_from_bytes_big(t, b1);
-	fmt.printf("big: %v | err: %v\n", b1, err);
-
-	int_from_bytes_big(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b2);
-	fmt.printf("big python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_big_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big(v, b3, true);
-	fmt.printf("big signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_big(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_big_python(v, b4, true);
-	fmt.printf("big signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_big_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
-
-int_to_byte_little :: proc(v: ^Int) {
-	err: Error;
-	size: int;
-	print("v: ", v);
-	fmt.println();
-
-	t := &Int{};
-	defer destroy(t);
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b1 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b1);
-	fmt.printf("little: %v | err: %v\n", b1, err);
-
-	int_from_bytes_little(t, b1);
-	if internal_cmp_mag(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b2 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b2);
-	fmt.printf("little python: %v | err: %v\n", b2, err);
-
-	if err == nil {
-		int_from_bytes_little_python(t, b2);
-		if internal_cmp_mag(t, v) != 0 {
-			print("\tError parsing t: ", t);
-		}
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b3 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little(v, b3, true);
-	fmt.printf("little signed: %v | err: %v\n", b3, err);
-
-	int_from_bytes_little(t, b3, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-
-	if size, err = int_to_bytes_size(v, true); err != nil {
-		fmt.printf("int_to_bytes_size returned: %v\n", err);
-		return;
-	}
-	b4 := make([]u8, size, context.temp_allocator);
-	err = int_to_bytes_little_python(v, b4, true);
-	fmt.printf("little signed python: %v | err: %v\n", b4, err);
-
-	int_from_bytes_little_python(t, b4, true);
-	if internal_cmp(t, v) != 0 {
-		print("\tError parsing t: ", t);
-	}
-}
-
 // printf :: fmt.printf;
 
 demo :: proc() {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 85a7b931f..182e2bffa 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -109,6 +109,101 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.alloc
 	return _private_int_exponent_mod(res, G, X, P, 0);
 }
 
+/*
+	Kronecker symbol (a|p)
+	Straightforward implementation of algorithm 1.4.10 in
+	Henri Cohen: "A Course in Computational Algebraic Number Theory"
+
+	@book{cohen2013course,
+		title={A course in computational algebraic number theory},
+		author={Cohen, Henri},
+		volume={138},
+		year={2013},
+		publisher={Springer Science \& Business Media}
+	}
+
+	Assumes `a` and `p` to not be `nil` and to have been initialized.
+*/
+internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (kronecker: int, err: Error) {
+	context.allocator = allocator;
+
+	a1, p1, r := &Int{}, &Int{}, &Int{};
+	defer internal_destroy(a1, p1, r);
+
+	table := []int{0, 1, 0, -1, 0, -1, 0, 1};
+
+	if internal_int_is_zero(p) {
+		if a.used == 1 && a.digit[0] == 1 {
+			return 1, nil;
+		} else {
+			return 0, nil;
+		}
+	}
+
+	if internal_is_even(a) && internal_is_even(p) {
+		return 0, nil;
+	}
+
+	internal_copy(a1, a) or_return;
+	internal_copy(p1, p) or_return;
+
+	v := internal_count_lsb(p1) or_return;
+	internal_shr(p1, p1, v) or_return;
+
+	k := 1 if v & 1 == 0 else table[a.digit[0] & 7];
+
+	if internal_is_negative(p1) {
+		p1.sign = .Zero_or_Positive;
+		if internal_is_negative(a1) {
+			k = -k;
+		}
+	}
+
+	internal_zero(r) or_return;
+
+	for {
+		if internal_is_zero(a1) {
+			if internal_cmp(p1, 1) == 0 {
+				return k, nil;
+			} else {
+				return 0, nil;
+			}
+		}
+
+		v = internal_count_lsb(a1) or_return;
+		internal_shr(a1, a1, v) or_return;
+
+		if v & 1 == 1 {
+			k = k * table[p1.digit[0] & 7];
+		}
+
+		if internal_is_negative(a1) {
+			/*
+				Compute k = (-1)^((a1)*(p1-1)/4) * k.
+				a1.digit[0] + 1 cannot overflow because the MSB
+				of the DIGIT type is not set by definition.
+			 */
+			if a1.digit[0] + 1 & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		} else {
+			/*
+				Compute k = (-1)^((a1-1)*(p1-1)/4) * k.
+			*/
+			if a1.digit[0] & p1.digit[0] & 2 != 0 {
+				k = -k;
+			}
+		}
+
+		internal_copy(r, a1) or_return;
+		r.sign = .Zero_or_Positive;
+
+		internal_mod(a1, p1, r) or_return;
+		internal_copy(p1, r)    or_return;
+	}
+	return;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From 335d361fc6b878f283423b74babab3f5e13b847c Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 18:00:00 +0200
Subject: [PATCH 12/33] big: Add comparison helpers.

---
 core/math/big/internal.odin | 174 +++++++++++++++++++++
 core/math/big/public.odin   | 292 ++++++++++++++++++++++++++++++++++++
 2 files changed, 466 insertions(+)

diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index ca113275c..aa62569e7 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1098,6 +1098,7 @@ internal_is_power_of_two :: proc { internal_int_is_power_of_two, };
 	Expects `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	/*
@@ -1121,6 +1122,7 @@ internal_cmp :: internal_compare;
 	Expects: `a` and `b` both to be valid `Int`s, i.e. initialized and not `nil`.
 */
 internal_int_compare_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (comparison: int) {
+	assert_if_nil(a);
 	a_is_negative := #force_inline internal_is_negative(a);
 
 	switch {
@@ -1152,6 +1154,7 @@ internal_cmp_digit :: internal_compare_digit;
 	Compare the magnitude of two `Int`s, unsigned.
 */
 internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
+	assert_if_nil(a, b);
 	/*
 		Compare based on used digits.
 	*/
@@ -1179,6 +1182,177 @@ internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison:
 internal_compare_magnitude :: proc { internal_int_compare_magnitude, };
 internal_cmp_mag :: internal_compare_magnitude;
 
+
+/*
+	bool := a < b
+*/
+internal_int_less_than :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp(a, b) == -1;
+}
+
+/*
+	bool := a < b
+*/
+internal_int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than: bool) {
+	return internal_cmp_digit(a, b) == -1;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool) {
+	return internal_cmp_mag(a, b) == -1;
+}
+
+internal_less_than :: proc {
+	internal_int_less_than,
+	internal_int_less_than_digit,
+}
+internal_lt :: internal_less_than;
+
+internal_less_than_abs :: proc {
+	internal_int_less_than_abs,
+}
+internal_lt_abs :: internal_less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp(a, b) <= 0;
+}
+
+/*
+	bool := a <= b
+*/
+internal_int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (less_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) <= 0;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) <= 0;
+}
+
+internal_less_than_or_equal :: proc {
+	internal_int_less_than_or_equal,
+	internal_int_less_than_or_equal_digit,
+}
+internal_lteq :: internal_less_than_or_equal;
+
+internal_less_than_or_equal_abs :: proc {
+	internal_int_less_than_or_equal_abs,
+}
+internal_lteq_abs :: internal_less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+internal_int_equals :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp(a, b) == 0;
+}
+
+/*
+	bool := a == b
+*/
+internal_int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (equals: bool) {
+	return internal_cmp_digit(a, b) == 0;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
+	return internal_cmp_mag(a, b) == 0;
+}
+
+internal_equals :: proc {
+	internal_int_equals,
+	internal_int_equals_digit,
+}
+internal_eq :: internal_equals;
+
+internal_equals_abs :: proc {
+	internal_int_equals_abs,
+}
+internal_eq_abs :: internal_equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp(a, b) >= 0;
+}
+
+/*
+	bool := a >= b
+*/
+internal_int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than_or_equal: bool) {
+	return internal_cmp_digit(a, b) >= 0;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (greater_than_or_equal: bool) {
+	return internal_cmp_mag(a, b) >= 0;
+}
+
+internal_greater_than_or_equal :: proc {
+	internal_int_greater_than_or_equal,
+	internal_int_greater_than_or_equal_digit,
+}
+internal_gteq :: internal_greater_than_or_equal;
+
+internal_greater_than_or_equal_abs :: proc {
+	internal_int_greater_than_or_equal_abs,
+}
+internal_gteq_abs :: internal_greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp(a, b) == 1;
+}
+
+/*
+	bool := a > b
+*/
+internal_int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT) -> (greater_than: bool) {
+	return internal_cmp_digit(a, b) == 1;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than: bool) {
+	return internal_cmp_mag(a, b) == 1;
+}
+
+internal_greater_than :: proc {
+	internal_int_greater_than,
+	internal_int_greater_than_digit,
+}
+internal_gt :: internal_greater_than;
+
+internal_greater_than_abs :: proc {
+	internal_int_greater_than_abs,
+}
+internal_gt_abs :: internal_greater_than_abs;
+
+
 /*
 	Check if remainders are possible squares - fast exclude non-squares.
 
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index 68c574905..f701663a7 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -561,6 +561,298 @@ int_compare_magnitude :: proc(a, b: ^Int, allocator := context.allocator) -> (re
 
 	return #force_inline internal_cmp_mag(a, b), nil;
 }
+int_cmp_mag :: int_compare_magnitude;
+
+
+/*
+	bool := a < b
+*/
+int_less_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := a < b
+*/
+int_less_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == -1, err;
+}
+
+/*
+	bool := |a| < |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == -1, err;
+}
+
+less_than :: proc {
+	int_less_than,
+	int_less_than_digit,
+}
+lt :: less_than;
+
+less_than_abs :: proc {
+	int_less_than_abs,
+}
+lt_abs :: less_than_abs;
+
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := a <= b
+*/
+int_less_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c <= 0, err;
+}
+
+/*
+	bool := |a| <= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (less_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c <= 0, err;
+}
+
+less_than_or_equal :: proc {
+	int_less_than_or_equal,
+	int_less_than_or_equal_digit,
+}
+lteq :: less_than_or_equal;
+
+less_than_or_equal_abs :: proc {
+	int_less_than_or_equal_abs,
+}
+lteq_abs :: less_than_or_equal_abs;
+
+
+/*
+	bool := a == b
+*/
+int_equals :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := a == b
+*/
+int_equals_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c == 0, err;
+}
+
+/*
+	bool := |a| == |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (equals: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c == 0, err;
+}
+
+equals :: proc {
+	int_equals,
+	int_equals_digit,
+}
+eq :: equals;
+
+equals_abs :: proc {
+	int_equals_abs,
+}
+eq_abs :: equals_abs;
+
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := a >= b
+*/
+int_greater_than_or_equal_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c >= 0, err;
+}
+
+/*
+	bool := |a| >= |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than_or_equal: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c >= 0, err;
+}
+
+greater_than_or_equal :: proc {
+	int_greater_than_or_equal,
+	int_greater_than_or_equal_digit,
+}
+gteq :: greater_than_or_equal;
+
+greater_than_or_equal_abs :: proc {
+	int_greater_than_or_equal_abs,
+}
+gteq_abs :: greater_than_or_equal_abs;
+
+
+/*
+	bool := a > b
+*/
+int_greater_than :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := a > b
+*/
+int_greater_than_digit :: #force_inline proc(a: ^Int, b: DIGIT, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a) or_return;
+
+	c: int;
+	c, err = cmp(a, b);
+
+	return c > 0, err;
+}
+
+/*
+	bool := |a| > |b|
+    Compares the magnitudes only, ignores the sign.
+*/
+int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator) -> (greater_than: bool, err: Error) {
+	assert_if_nil(a, b);
+	context.allocator = allocator;
+
+	internal_clear_if_uninitialized(a, b) or_return;
+
+	c: int;
+	c, err = cmp_mag(a, b);
+
+	return c > 0, err;
+}
+
+greater_than :: proc {
+	int_greater_than,
+	int_greater_than_digit,
+}
+gt :: greater_than;
+
+greater_than_abs :: proc {
+	int_greater_than_abs,
+}
+gt_abs :: greater_than_abs;
+
 
 /*
 	Check if remainders are possible squares - fast exclude non-squares.

From 671b413b15db1c0b4a5c04735b1b3bf047ce7fa9 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 19:12:32 +0200
Subject: [PATCH 13/33] big: Use new comparison helpers.

---
 core/math/big/build.bat     |  5 ++--
 core/math/big/internal.odin | 29 +++++++++++----------
 core/math/big/prime.odin    |  2 +-
 core/math/big/private.odin  | 50 +++++++++++++++++--------------------
 core/math/big/test.py       |  1 -
 5 files changed, 41 insertions(+), 46 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index e6049b2e1..6e7f3a166 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,9 +1,10 @@
 @echo off
-odin run . -vet
+:odin run . -vet
 
 set TEST_ARGS=-fast-tests
+:set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index aa62569e7..0f66fcdaf 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -136,7 +136,7 @@ internal_int_add_signed :: proc(dest, a, b: ^Int, allocator := context.allocator
 		Subtract the one with the greater magnitude from the other.
 		The result gets the sign of the one with the greater magnitude.
 	*/
-	if #force_inline internal_cmp_mag(a, b) == -1 {
+	if #force_inline internal_lt_abs(a, b) {
 		x, y = y, x;
 	}
 
@@ -358,7 +358,7 @@ internal_int_sub_signed :: proc(dest, number, decrease: ^Int, allocator := conte
 		Subtract a positive from a positive, OR negative from a negative.
 		First, take the difference between their magnitudes, then...
 	*/
-	if #force_inline internal_cmp_mag(number, decrease) == -1 {
+	if #force_inline internal_lt_abs(number, decrease) {
 		/*
 			The second has a larger magnitude.
 			The result has the *opposite* sign from the first number.
@@ -718,7 +718,7 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	/*
 		If numerator < denominator then quotient = 0, remainder = numerator.
 	*/
-	if #force_inline internal_cmp_mag(numerator, denominator) == -1 {
+	if #force_inline internal_lt_abs(numerator, denominator) {
 		if remainder != nil {
 			internal_copy(remainder, numerator) or_return;
 		}
@@ -731,7 +731,6 @@ internal_int_divmod :: proc(quotient, remainder, numerator, denominator: ^Int, a
 	if (denominator.used > 2 * MUL_KARATSUBA_CUTOFF) && (denominator.used <= (numerator.used / 3) * 2) {
 		assert(denominator.used >= 160 && numerator.used >= 240, "MUL_KARATSUBA_CUTOFF global not properly set.");
 		err = _private_int_div_recursive(quotient, remainder, numerator, denominator);
-		// err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
 	} else {
 		when true {
 			err = #force_inline _private_int_div_school(quotient, remainder, numerator, denominator);
@@ -1243,12 +1242,12 @@ internal_less_than_or_equal :: proc {
 	internal_int_less_than_or_equal,
 	internal_int_less_than_or_equal_digit,
 }
-internal_lteq :: internal_less_than_or_equal;
+internal_lte :: internal_less_than_or_equal;
 
 internal_less_than_or_equal_abs :: proc {
 	internal_int_less_than_or_equal_abs,
 }
-internal_lteq_abs :: internal_less_than_or_equal_abs;
+internal_lte_abs :: internal_less_than_or_equal_abs;
 
 
 /*
@@ -1311,12 +1310,12 @@ internal_greater_than_or_equal :: proc {
 	internal_int_greater_than_or_equal,
 	internal_int_greater_than_or_equal_digit,
 }
-internal_gteq :: internal_greater_than_or_equal;
+internal_gte :: internal_greater_than_or_equal;
 
 internal_greater_than_or_equal_abs :: proc {
 	internal_int_greater_than_or_equal_abs,
 }
-internal_gteq_abs :: internal_greater_than_or_equal_abs;
+internal_gte_abs :: internal_greater_than_or_equal_abs;
 
 
 /*
@@ -1410,7 +1409,7 @@ internal_int_is_square :: proc(a: ^Int, allocator := context.allocator) -> (squa
 	sqrt(t, a) or_return;
 	sqr(t, t)  or_return;
 
-	square = internal_cmp_mag(t, a) == 0;
+	square = internal_eq_abs(t, a);
 
 	return;
 }
@@ -1642,7 +1641,7 @@ internal_int_sqrt :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 		internal_add(t2, t1, x)  or_return;
 		internal_shr(y, t2, 1)   or_return;
 
-		if c := internal_cmp(y, x); c == 0 || c == 1 {
+		if internal_gte(y, x) {
 			internal_swap(dest, x);
 			return nil;
 		}
@@ -1757,8 +1756,8 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 			 Number of rounds is at most log_2(root). If it is more it
 			 got stuck, so break out of the loop and do the rest manually.
 		*/
-		if ilog2 -= 1;    ilog2 == 0 { break; }
-		if internal_cmp(t1, t2) == 0 { break; }
+		if ilog2 -= 1; ilog2 == 0 { break; }
+		if internal_eq(t1, t2)    { break; }
 
 		iterations += 1;
 		if iterations == MAX_ITERATIONS_ROOT_N {
@@ -1796,7 +1795,7 @@ internal_int_root_n :: proc(dest, src: ^Int, n: int, allocator := context.alloca
 	for {
 		internal_pow(t2, t1, n) or_return;
 	
-		if internal_cmp(t2, a) != 1 { break; }
+		if internal_lt(t2, a) { break; }
 		
 		internal_sub(t1, t1, DIGIT(1)) or_return;
 
@@ -2001,12 +2000,12 @@ internal_int_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		For all n in N and n > 0, n = 0 mod 1.
 	*/
-	if internal_is_positive(a) && internal_cmp(b, 1) == 0 { return internal_zero(dest);	}
+	if internal_is_positive(a) && internal_eq(b, 1) { return internal_zero(dest);	}
 
 	/*
 		`b` cannot be negative and has to be > 1
 	*/
-	if internal_is_negative(b) && internal_cmp(b, 1) != 1 { return .Invalid_Argument; }
+	if internal_is_negative(b) || internal_gt(b, 1) { return .Invalid_Argument; }
 
 	/*
 		If the modulus is odd we can use a faster routine instead.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 182e2bffa..d6626ffbf 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -163,7 +163,7 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 
 	for {
 		if internal_is_zero(a1) {
-			if internal_cmp(p1, 1) == 0 {
+			if internal_eq(p1, 1) {
 				return k, nil;
 			} else {
 				return 0, nil;
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 72f5bb9e2..fc2fe69e8 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1071,11 +1071,11 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 
 	internal_shl_digit(y, n - t) or_return;
 
-	c := internal_cmp(x, y);
-	for c != -1 {
+	gte := internal_gte(x, y);
+	for gte {
 		q.digit[n - t] += 1;
 		internal_sub(x, x, y) or_return;
-		c = internal_cmp(x, y);
+		gte = internal_gte(x, y);
 	}
 
 	/*
@@ -1134,7 +1134,7 @@ _private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^In
 			t2.digit[2] = x.digit[i];
 			t2.used = 3;
 
-			if t1_t2 := internal_cmp_mag(t1, t2); t1_t2 != 1 {
+			if internal_lte(t1, t2) {
 				break;
 			}
 			iter += 1; if iter > 100 {
@@ -1227,15 +1227,13 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
 	*/
-	if internal_cmp(A1, 0) == -1 {
+	if internal_lt(A1, 0) {
 		internal_shl(t, b, k * _DIGIT_BITS) or_return;
 
 		for {
 			internal_decr(Q1) or_return;
 			internal_add(A1, A1, t) or_return;
-			if internal_cmp(A1, 0) != -1 {
-				break;
-			}
+			if internal_gte(A1, 0) { break; }
 		}
 	}
 
@@ -1256,7 +1254,7 @@ _private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := con
 	/*
 		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
 	*/
-	for internal_cmp(A2, 0) == -1 {
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
 		internal_decr(Q0) or_return;
 		internal_add(A2, A2, b) or_return;
 	}
@@ -1391,7 +1389,7 @@ _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int
 		shl(tq, tq, n)       or_return;
 
 		for n >= 0 {
-			if c, _ = cmp_mag(ta, tb); c == 0 || c == 1 {
+			if internal_gte(ta, tb) {
 				// ta -= tb
 				sub(ta, ta, tb) or_return;
 				//  q += tq
@@ -1596,7 +1594,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		/*
 			Make sure `v` is the largest.
 		*/
-		if internal_cmp_mag(u, v) == 1 {
+		if internal_gt(u, v) {
 			/*
 				Swap `u` and `v` to make sure `v` is >= `u`.
 			*/
@@ -1634,7 +1632,7 @@ _private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 		Computes least common multiple as `|a*b|/gcd(a,b)`
 		Divide the smallest by the GCD.
 	*/
-	if internal_cmp_mag(a, b) == -1 {
+	if internal_lt_abs(a, b) {
 		/*
 			Store quotient in `t2` such that `t2 * b` is the LCM.
 		*/
@@ -1694,9 +1692,7 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 		/*
 			Iterate until `a` is bracketed between low + high.
 		*/
-		if #force_inline internal_cmp(bracket_high, a) != -1 {
-			break;
-		}
+		if #force_inline internal_gte(bracket_high, a) { break; }
 
 		low = high;
 		#force_inline internal_copy(bracket_low, bracket_high) or_return;
@@ -1844,7 +1840,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		if A >= m then A = A - m
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n);
 	}
 	return nil;
@@ -1932,7 +1928,7 @@ _private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := cont
 	/*
 		if x >= n then x = x - n
 	*/
-	if internal_cmp_mag(x, n) != -1 {
+	if internal_gte_abs(x, n) {
 		return internal_sub(x, x, n);
 	}
 
@@ -1969,7 +1965,7 @@ _private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := cont
 	*/
 	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
 		internal_int_shl1(a, a)                                      or_return;
-		if internal_cmp_mag(a, b) != -1 {
+		if internal_gte_abs(a, b) {
 			internal_sub(a, a, b)                                    or_return;
 		}
 	}
@@ -2064,7 +2060,7 @@ _private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		If x < 0, add b**(k+1) to it.
 	*/
-	if internal_cmp(x, 0) == -1 {
+	if internal_is_negative(x) {
 		internal_set(q, 1)                                           or_return;
 		internal_shl_digit(q, um + 1)                                or_return;
 		internal_add(x, x, q)                                        or_return;
@@ -2073,7 +2069,7 @@ _private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (
 	/*
 		Back off if it's too big.
 	*/
-	for internal_cmp(x, m) != -1 {
+	for internal_gte(x, m) {
 		internal_sub(x, x, m)                                        or_return;
 	}
 
@@ -2110,7 +2106,7 @@ _private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.alloca
 			a = a + q
 		*/
 		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
+		if internal_lt_abs(a, n)                                     { break; }
 		internal_sub(a, a, n)                                        or_return;
 	}
 
@@ -2146,7 +2142,7 @@ _private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator)
 			a = a + q
 		*/
 		internal_add(a, a, q)                                        or_return;
-		if internal_cmp_mag(a, n) == -1                              { break; }
+		if internal_lt_abs(a, n)                                     { break; }
 		internal_sub(a, a, n)                                        or_return;
 	}
 
@@ -2359,7 +2355,7 @@ _private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.alloca
 			If x >= n then subtract and reduce again.
 			Each successive "recursion" makes the input smaller and smaller.
 		*/
-		if internal_cmp_mag(x, n) == -1 { break; }
+		if internal_lt_abs(x, n) { break; }
 
 		internal_sub(x, x, n)                                        or_return;
 	}
@@ -2985,21 +2981,21 @@ _private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator
 	/*
 		If `v` != `1` then there is no inverse.
 	*/
-	if internal_cmp(v, 1) != 0 {
+	if !internal_eq(v, 1) {
 		return .Invalid_Argument;
 	}
 
 	/*
 		If its too low.
 	*/
-	if internal_cmp(C, 0) == -1 {
+	if internal_is_negative(C) {
 		internal_add(C, C, b) or_return;
 	}
 
 	/*
 		Too big.
 	*/
-	if internal_cmp(C, 0) != -1 {
+	if internal_gte(C, 0) {
 		internal_sub(C, C, b) or_return;
 	}
 
@@ -3152,7 +3148,7 @@ _private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.alloc
 	/*
 		Too big.
 	*/
-	for internal_cmp_mag(D, b) != -1 {
+	for internal_gte_abs(D, b) {
 		internal_sub(D, D, b) or_return;
 	}
 
diff --git a/core/math/big/test.py b/core/math/big/test.py
index e095b061e..4d314401f 100644
--- a/core/math/big/test.py
+++ b/core/math/big/test.py
@@ -413,7 +413,6 @@ def test_shr_signed(a = 0, bits = 0, expected_error = Error.Okay):
 	return test("test_shr_signed", res, [a, bits], expected_error, expected_result)
 
 def test_factorial(number = 0, expected_error = Error.Okay):
-	print("Factorial:", number)
 	args  = [number]
 	try:
 		res = int_factorial(*args)

From ae354731edd1d408f1852c4fb19af2c840cbed46 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 19:18:13 +0200
Subject: [PATCH 14/33] big: Add ; after proc map.

---
 core/math/big/build.bat     |  4 ++--
 core/math/big/internal.odin | 20 ++++++++++----------
 core/math/big/public.odin   | 20 ++++++++++----------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 6e7f3a166..4a6aeeb3e 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -4,7 +4,7 @@
 set TEST_ARGS=-fast-tests
 :set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 0f66fcdaf..72ff1fe76 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1207,12 +1207,12 @@ internal_int_less_than_abs :: #force_inline proc(a, b: ^Int) -> (less_than: bool
 internal_less_than :: proc {
 	internal_int_less_than,
 	internal_int_less_than_digit,
-}
+};
 internal_lt :: internal_less_than;
 
 internal_less_than_abs :: proc {
 	internal_int_less_than_abs,
-}
+};
 internal_lt_abs :: internal_less_than_abs;
 
 
@@ -1241,12 +1241,12 @@ internal_int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (less_t
 internal_less_than_or_equal :: proc {
 	internal_int_less_than_or_equal,
 	internal_int_less_than_or_equal_digit,
-}
+};
 internal_lte :: internal_less_than_or_equal;
 
 internal_less_than_or_equal_abs :: proc {
 	internal_int_less_than_or_equal_abs,
-}
+};
 internal_lte_abs :: internal_less_than_or_equal_abs;
 
 
@@ -1275,12 +1275,12 @@ internal_int_equals_abs :: #force_inline proc(a, b: ^Int) -> (equals: bool) {
 internal_equals :: proc {
 	internal_int_equals,
 	internal_int_equals_digit,
-}
+};
 internal_eq :: internal_equals;
 
 internal_equals_abs :: proc {
 	internal_int_equals_abs,
-}
+};
 internal_eq_abs :: internal_equals_abs;
 
 
@@ -1309,12 +1309,12 @@ internal_int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int) -> (gre
 internal_greater_than_or_equal :: proc {
 	internal_int_greater_than_or_equal,
 	internal_int_greater_than_or_equal_digit,
-}
+};
 internal_gte :: internal_greater_than_or_equal;
 
 internal_greater_than_or_equal_abs :: proc {
 	internal_int_greater_than_or_equal_abs,
-}
+};
 internal_gte_abs :: internal_greater_than_or_equal_abs;
 
 
@@ -1343,12 +1343,12 @@ internal_int_greater_than_abs :: #force_inline proc(a, b: ^Int) -> (greater_than
 internal_greater_than :: proc {
 	internal_int_greater_than,
 	internal_int_greater_than_digit,
-}
+};
 internal_gt :: internal_greater_than;
 
 internal_greater_than_abs :: proc {
 	internal_int_greater_than_abs,
-}
+};
 internal_gt_abs :: internal_greater_than_abs;
 
 
diff --git a/core/math/big/public.odin b/core/math/big/public.odin
index f701663a7..e8ff1e28b 100644
--- a/core/math/big/public.odin
+++ b/core/math/big/public.odin
@@ -613,12 +613,12 @@ int_less_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocat
 less_than :: proc {
 	int_less_than,
 	int_less_than_digit,
-}
+};
 lt :: less_than;
 
 less_than_abs :: proc {
 	int_less_than_abs,
-}
+};
 lt_abs :: less_than_abs;
 
 
@@ -671,12 +671,12 @@ int_less_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := contex
 less_than_or_equal :: proc {
 	int_less_than_or_equal,
 	int_less_than_or_equal_digit,
-}
+};
 lteq :: less_than_or_equal;
 
 less_than_or_equal_abs :: proc {
 	int_less_than_or_equal_abs,
-}
+};
 lteq_abs :: less_than_or_equal_abs;
 
 
@@ -729,12 +729,12 @@ int_equals_abs :: #force_inline proc(a, b: ^Int, allocator := context.allocator)
 equals :: proc {
 	int_equals,
 	int_equals_digit,
-}
+};
 eq :: equals;
 
 equals_abs :: proc {
 	int_equals_abs,
-}
+};
 eq_abs :: equals_abs;
 
 
@@ -787,12 +787,12 @@ int_greater_than_or_equal_abs :: #force_inline proc(a, b: ^Int, allocator := con
 greater_than_or_equal :: proc {
 	int_greater_than_or_equal,
 	int_greater_than_or_equal_digit,
-}
+};
 gteq :: greater_than_or_equal;
 
 greater_than_or_equal_abs :: proc {
 	int_greater_than_or_equal_abs,
-}
+};
 gteq_abs :: greater_than_or_equal_abs;
 
 
@@ -845,12 +845,12 @@ int_greater_than_abs :: #force_inline proc(a, b: ^Int, allocator := context.allo
 greater_than :: proc {
 	int_greater_than,
 	int_greater_than_digit,
-}
+};
 gt :: greater_than;
 
 greater_than_abs :: proc {
 	int_greater_than_abs,
-}
+};
 gt_abs :: greater_than_abs;
 
 

From e639c61499c45d46c0d6e50adeb3dd6526b0fb89 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Wed, 1 Sep 2021 22:06:07 +0200
Subject: [PATCH 15/33] big: Add Miller-Rabin.

---
 core/math/big/prime.odin | 82 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index d6626ffbf..bd2dcef4d 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -204,6 +204,88 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 	return;
 }
 
+/*
+	Miller-Rabin test of "a" to the base of "b" as described in HAC pp. 139 Algorithm 4.24.
+
+	Sets result to 0 if definitely composite or 1 if probably prime.
+	Randomly the chance of error is no more than 1/4 and often very much lower.
+
+	Assumes `a` and `b` not to be `nil` and to have been initialized.
+*/
+internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocator) -> (probably_prime: bool, err: Error) {
+	context.allocator = allocator;
+
+	n1, y, r := &Int{}, &Int{}, &Int{};
+	defer internal_destroy(n1, y, r);
+
+	/*
+		Ensure `b` > 1.
+	*/
+	if internal_gt(b, 1) { return false, nil; }
+
+	/*
+		Get n1 = a - 1.
+	*/
+	internal_copy(n1, a) or_return;
+	internal_sub(n1, n1, 1) or_return;
+
+	/*
+		Set 2**s * r = n1
+	*/
+	internal_copy(r, n1) or_return;
+
+	/*
+		Count the number of least significant bits which are zero.
+	*/
+	s := internal_count_lsb(r) or_return;
+
+	/*
+		Now divide n - 1 by 2**s.
+	*/
+	internal_shr(r, r, s) or_return;
+
+	/*
+		Compute y = b**r mod a.
+	*/
+	internal_int_exponent_mod(y, b, r, a) or_return;
+
+	/*
+		If y != 1 and y != n1 do.
+	*/
+	if !internal_eq(y, 1) && !internal_eq(y, n1) {
+		j := 1;
+
+		/*
+			While `j` <= `s` - 1 and `y` != `n1`.
+		*/
+		for j <= (s - 1) && !internal_eq(y, n1) {
+			internal_sqrmod(y, y, a) or_return;
+
+			/*
+				If `y` == 1 then composite.
+			*/
+			if internal_eq(y, 1) {
+				return false, nil;
+			}
+
+			j += 1;
+		}
+
+		/*
+			If `y` != `n1` then composite.
+		*/
+		if !internal_eq(y, n1) {
+			return false, nil;
+		}
+	}
+
+	/*
+		Probably prime now.
+	*/
+	return true, nil;
+}
+
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From 31918d3b8f63ea0b9d5c50cce73a4710167c03f4 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Thu, 2 Sep 2021 18:31:08 +0200
Subject: [PATCH 16/33] big: Add `internal_int_is_prime`.

---
 core/math/big/build.bat     |   4 +-
 core/math/big/common.odin   |  11 ++
 core/math/big/example.odin  |  48 ++++++++
 core/math/big/internal.odin |   2 +-
 core/math/big/prime.odin    | 233 +++++++++++++++++++++++++++++++++++-
 core/math/big/private.odin  |   2 +-
 core/math/big/radix.odin    |  16 +--
 core/math/big/tune.odin     |   1 +
 8 files changed, 299 insertions(+), 18 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 4a6aeeb3e..d9495e612 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,10 +1,10 @@
 @echo off
-:odin run . -vet
+odin run . -vet
 
 set TEST_ARGS=-fast-tests
 :set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index 4171d25f3..4d8224cd6 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -75,6 +75,17 @@ FACTORIAL_MAX_N       := 1_000_000;
 FACTORIAL_BINARY_SPLIT_CUTOFF         := 6100;
 FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
 
+/*
+	`internal_int_is_prime` switchables.
+
+	Use Frobenius-Underwood for primality testing, or use Lucas-Selfridge (default).
+*/
+MATH_BIG_USE_FROBENIUS_TEST :: #config(MATH_BIG_USE_FROBENIUS_TEST, false);
+
+/*
+	Runtime tunable to use Miller-Rabin primality testing only and skip the above.
+*/
+USE_MILLER_RABIN_ONLY       := false;
 
 /*
 	We don't allow these to be switched at runtime for two reasons:
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 4da2ebbe9..49df357d6 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -26,6 +26,7 @@ Configuration:
 	_WARRAY                               %v
 	_TAB_SIZE                             %v
 	_MAX_WIN_SIZE                         %v
+	MATH_BIG_USE_FROBENIUS_TEST           %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -35,6 +36,7 @@ Runtime tunable:
 	FACTORIAL_MAX_N                       %v
 	FACTORIAL_BINARY_SPLIT_CUTOFF         %v
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
+	USE_MILLER_RABIN_ONLY                 %v
 
 `, _DIGIT_BITS,
 _LOW_MEMORY,
@@ -45,6 +47,8 @@ _MAX_COMBA,
 _WARRAY,
 _TAB_SIZE,
 _MAX_WIN_SIZE,
+MATH_BIG_USE_FROBENIUS_TEST,
+
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
 MUL_TOOM_CUTOFF,
@@ -53,6 +57,7 @@ MAX_ITERATIONS_ROOT_N,
 FACTORIAL_MAX_N,
 FACTORIAL_BINARY_SPLIT_CUTOFF,
 FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS,
+USE_MILLER_RABIN_ONLY,
 );
 
 }
@@ -84,6 +89,49 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
+
+	err: Error;
+	prime: bool;
+
+	foo := [4]f64{1, 2, 4, 5};
+	fmt.println(foo.rrr);
+
+	trials := 15;
+
+	{
+		SCOPED_TIMING(.is_prime);
+		for p in _private_prime_table[2:] {
+
+			set(a, p);
+			prime, err = internal_int_is_prime(a, trials);
+			if !prime || err != nil {
+				fmt.printf("%v wrongly flagged as composite\n", p);
+			}
+
+			set(a, p - 1);
+			prime, err = internal_int_is_prime(a, trials);
+			if prime || err != nil {
+				fmt.printf("%v wrongly flagged as prime\n", p);
+			}
+
+			set(a, p + 1);
+			prime, err = internal_int_is_prime(a, trials);
+			if prime || err != nil {
+				fmt.printf("%v wrongly flagged as prime\n", p);
+			}
+		}
+	}
+	Timings[.is_prime].count = len(_private_prime_table[2:]) * 3;
+
+	internal_set(a, "3317044064679887385961981");
+
+	{
+		SCOPED_TIMING(.is_prime);
+		prime, err = internal_int_is_prime(a, trials);
+		if prime || err != nil {
+			print("Wrongly flagged as prime: ", a);
+		}
+	}
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 72ff1fe76..c603dcdd8 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1871,7 +1871,7 @@ internal_int_set_from_integer :: proc(dest: ^Int, src: $T, minimize := false, al
 	return nil;
 }
 
-internal_set :: proc { internal_int_set_from_integer, internal_int_copy };
+internal_set :: proc { internal_int_set_from_integer, internal_int_copy, int_atoi };
 
 internal_copy_digits :: #force_inline proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
 	#force_inline internal_error_if_immutable(dest) or_return;
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index bd2dcef4d..f97d4fe81 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -10,6 +10,8 @@
 */
 package math_big
 
+import rnd "core:math/rand";
+
 /*
 	Determines if an Integer is divisible by one of the _PRIME_TABLE primes.
 	Returns true if it is, false if not. 
@@ -207,7 +209,7 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 /*
 	Miller-Rabin test of "a" to the base of "b" as described in HAC pp. 139 Algorithm 4.24.
 
-	Sets result to 0 if definitely composite or 1 if probably prime.
+	Sets result to `false` if definitely composite or `true` if probably prime.
 	Randomly the chance of error is no more than 1/4 and often very much lower.
 
 	Assumes `a` and `b` not to be `nil` and to have been initialized.
@@ -224,13 +226,13 @@ internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocat
 	if internal_gt(b, 1) { return false, nil; }
 
 	/*
-		Get n1 = a - 1.
+		Get `n1` = `a` - 1.
 	*/
 	internal_copy(n1, a) or_return;
 	internal_sub(n1, n1, 1) or_return;
 
 	/*
-		Set 2**s * r = n1
+		Set `2`**`s` * `r` = `n1`
 	*/
 	internal_copy(r, n1) or_return;
 
@@ -240,17 +242,17 @@ internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocat
 	s := internal_count_lsb(r) or_return;
 
 	/*
-		Now divide n - 1 by 2**s.
+		Now divide `n` - 1 by `2`**`s`.
 	*/
 	internal_shr(r, r, s) or_return;
 
 	/*
-		Compute y = b**r mod a.
+		Compute `y` = `b`**`r` mod `a`.
 	*/
 	internal_int_exponent_mod(y, b, r, a) or_return;
 
 	/*
-		If y != 1 and y != n1 do.
+		If `y` != 1 and `y` != `n1` do.
 	*/
 	if !internal_eq(y, 1) && !internal_eq(y, n1) {
 		j := 1;
@@ -285,6 +287,225 @@ internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocat
 	return true, nil;
 }
 
+/*
+	`a` is the big Int to test for primality.
+
+	`miller_rabin_trials` can be one of the following:
+		< 0:	For `a` up to 3_317_044_064_679_887_385_961_981, set `miller_rabin_trials` to negative to run a predetermined
+				number of trials for a deterministic answer.
+		= 0:	Run Miller-Rabin with bases 2, 3 and one random base < `a`. Non-deterministic.
+		> 0:	Run Miller-Rabin with bases 2, 3 and `miller_rabin_trials` number of random bases. Non-deterministic.
+
+	`miller_rabin_only`:
+		`false`	Also use either Frobenius-Underwood or Lucas-Selfridge, depending on the compile-time `MATH_BIG_USE_FROBENIUS_TEST` choice.
+		`true`	Run Rabin-Miller trials but skip Frobenius-Underwood / Lucas-Selfridge.
+
+	`r` takes a pointer to an instance of `core:math/rand`'s `Rand` and may be `nil` to use the global one.
+
+	Returns `is_prime` (bool), where:
+		`false`	Definitively composite.
+		`true`	Probably prime if `miller_rabin_trials` >= 0, with increasing certainty with more trials.
+				Deterministically prime if `miller_rabin_trials` = 0 for `a` up to 3_317_044_064_679_887_385_961_981.
+
+	Assumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_rabin_only := USE_MILLER_RABIN_ONLY, r: ^rnd.Rand = nil, allocator := context.allocator) -> (is_prime: bool, err: Error) {
+	context.allocator = allocator;
+	miller_rabin_trials := miller_rabin_trials;
+
+	// Default to `no`.
+	is_prime = false;
+
+	b, res := &Int{}, &Int{};
+	defer internal_destroy(b, res);
+
+	// Some shortcuts
+	// `N` > 3
+	if a.used == 1 {
+		if a.digit[0] == 0 || a.digit[0] == 1 {
+			return;
+		}
+		if a.digit[0] == 2 {
+			return true, nil;
+		}
+	}
+
+	// `N` must be odd.
+	if internal_is_even(a) {
+		return;
+	}
+
+	// `N` is not a perfect square: floor(sqrt(`N`))^2 != `N` 
+	if internal_int_is_square(a) or_return { return; }
+
+	// Is the input equal to one of the primes in the table?
+	for p in _private_prime_table {
+		if internal_eq(a, p) {
+			return true, nil;
+		}
+	}
+
+	// First perform trial division
+	if internal_int_prime_is_divisible(a) or_return { return; }
+
+	// Run the Miller-Rabin test with base 2 for the BPSW test.
+	internal_set(b, 2) or_return;
+	if !internal_int_prime_miller_rabin(a, b) or_return { return; }
+
+	// Rumours have it that Mathematica does a second M-R test with base 3.
+	// Other rumours have it that their strong L-S test is slightly different.
+	// It does not hurt, though, beside a bit of extra runtime.
+
+	b.digit[0] += 1;
+	if !internal_int_prime_miller_rabin(a, b) or_return { return; }
+
+	// Both, the Frobenius-Underwood test and the the Lucas-Selfridge test are quite
+	// slow so if speed is an issue, set `USE_MILLER_RABIN_ONLY` to use M-R tests with
+	// bases 2, 3 and t random bases.
+
+	if !miller_rabin_only {
+		if miller_rabin_trials >= 0 {
+			when MATH_BIG_USE_FROBENIUS_TEST {
+//				err = mp_prime_frobenius_underwood(a, &res);
+//				if ((err != MP_OKAY) && (err != MP_ITER)) {
+//					goto LBL_B;
+//				}
+//				if (!res) {
+//					goto LBL_B;
+//				}
+			} else {
+//				if ((err = mp_prime_strong_lucas_selfridge(a, &res)) != MP_OKAY) {
+//					goto LBL_B;
+//				}
+//				if (!res) {
+//					goto LBL_B;
+//				}
+			}
+		}
+	}
+
+	// Run at least one Miller-Rabin test with a random base.
+	// Don't replace this with `min`, because we try known deterministic bases
+	//     for certain sized inputs when `miller_rabin_trials` is negative.
+	if miller_rabin_trials == 0 {
+		miller_rabin_trials = 1;
+	}
+
+	// Only recommended if the input range is known to be < 3_317_044_064_679_887_385_961_981
+	// It uses the bases necessary for a deterministic M-R test if the input is	smaller than 3_317_044_064_679_887_385_961_981
+	// The caller has to check the size.
+	// TODO: can be made a bit finer grained but comparing is not free.
+
+	if miller_rabin_trials < 0 {
+		p_max := 0;
+
+		// Sorenson, Jonathan; Webster, Jonathan (2015), "Strong Pseudoprimes to Twelve Prime Bases".
+
+		// 0x437ae92817f9fc85b7e5 = 318_665_857_834_031_151_167_461
+		atoi(b, "437ae92817f9fc85b7e5", 16) or_return;
+		if internal_lt(a, b) {
+			p_max = 12;
+		} else {
+			/* 0x2be6951adc5b22410a5fd = 3_317_044_064_679_887_385_961_981 */
+			atoi(b, "2be6951adc5b22410a5fd", 16) or_return;
+			if internal_lt(a, b) {
+				p_max = 13;
+			} else {
+				return false, .Invalid_Argument;
+			}
+		}
+
+		// We did bases 2 and 3  already, skip them
+		for ix := 2; ix < p_max; ix += 1 {
+			internal_set(b, _private_prime_table[ix]);
+			if !internal_int_prime_miller_rabin(a, b) or_return { return; }
+		}
+	} else if miller_rabin_trials > 0 {
+		// Perform `miller_rabin_trials` M-R tests with random bases between 3 and "a".
+		// See Fips 186.4 p. 126ff
+
+		// The DIGITs have a defined bit-size but the size of a.digit is a simple 'int',
+		// the size of which can depend on the platform.
+		size_a := internal_count_bits(a);
+		mask   := (1 << uint(ilog2(size_a))) - 1;
+
+		/*
+			Assuming the General Rieman hypothesis (never thought to write that in a
+			comment) the upper bound can be lowered to  2*(log a)^2.
+			E. Bach, "Explicit bounds for primality testing and related problems,"
+			Math. Comp. 55 (1990), 355-380.
+
+				size_a = (size_a/10) * 7;
+				len = 2 * (size_a * size_a);
+
+			E.g.: a number of size 2^2048 would be reduced to the upper limit
+
+				floor(2048/10)*7 = 1428
+				2 * 1428^2       = 4078368
+
+			(would have been ~4030331.9962 with floats and natural log instead)
+			That number is smaller than 2^28, the default bit-size of DIGIT on 32-bit platforms.
+		*/
+
+		/*
+			How many tests, you might ask? Dana Jacobsen of Math::Prime::Util fame
+			does exactly 1. In words: one. Look at the end of _GMP_is_prime() in
+			Math-Prime-Util-GMP-0.50/primality.c if you do not believe it.
+
+			The function rand() goes to some length to use a cryptographically
+			good PRNG. That also means that the chance to always get the same base
+			in the loop is non-zero, although very low.
+			-- NOTE(Jeroen): This is not yet true in Odin, but I have some ideas.
+
+			If the BPSW test and/or the addtional Frobenious test have been
+			performed instead of just the Miller-Rabin test with the bases 2 and 3,
+			a single extra test should suffice, so such a very unlikely event will not do much harm.
+
+			To preemptivly answer the dangling question: no, a witness does not	need to be prime.
+		*/
+		for ix := 0; ix < miller_rabin_trials; ix += 1 {
+
+			// rand() guarantees the first digit to be non-zero
+			internal_rand(b, _DIGIT_TYPE_BITS, r) or_return;
+
+			// Reduce digit before casting because DIGIT might be bigger than
+			// an unsigned int and "mask" on the other side is most probably not.
+			l: int;
+
+			fips_rand := (uint)(b.digit[0] & DIGIT(mask));
+			if fips_rand > (uint)(max(int) - _DIGIT_BITS) {
+				l = max(int) / _DIGIT_BITS;
+			} else {
+				l = (int(fips_rand) + _DIGIT_BITS) / _DIGIT_BITS;
+			}
+
+			// Unlikely.
+			if (l < 0) {
+				ix -= 1;
+				continue;
+			}
+			internal_rand(b, l) or_return;
+
+			// That number might got too big and the witness has to be smaller than "a"
+			l = internal_count_bits(b);
+			if l >= size_a {
+				l = (l - size_a) + 1;
+				internal_shr(b, b, l) or_return;
+			}
+
+			// Although the chance for b <= 3 is miniscule, try again.
+			if internal_lte(b, 3) {
+				ix -= 1;
+				continue;
+			}
+			if !internal_int_prime_miller_rabin(a, b) or_return { return; }
+		}
+	}
+
+	// Passed the test.
+	return true, nil;
+}
+
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index fc2fe69e8..002dbda09 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1373,7 +1373,7 @@ _private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator :=
 _private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
 
 	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{};
-	c: int;
+
 	defer internal_destroy(ta, tb, tq, q);
 
 	for {
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index 8a7040158..76854e244 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -413,14 +413,14 @@ _log_bases :: [65]u32{
 */
 RADIX_TABLE := "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 RADIX_TABLE_REVERSE := [RADIX_TABLE_REVERSE_SIZE]u8{
-   0x3e, 0xff, 0xff, 0xff, 0x3f, 0x00, 0x01, 0x02, 0x03, 0x04, /* +,-./01234 */
-   0x05, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, /* 56789:;<=> */
-   0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, /* ?@ABCDEFGH */
-   0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, /* IJKLMNOPQR */
-   0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0xff, 0xff, /* STUVWXYZ[\ */
-   0xff, 0xff, 0xff, 0xff, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, /* ]^_`abcdef */
-   0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, /* ghijklmnop */
-   0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, /* qrstuvwxyz */
+	0x3e, 0xff, 0xff, 0xff, 0x3f, 0x00, 0x01, 0x02, 0x03, 0x04, /* +,-./01234 */
+	0x05, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, /* 56789:;<=> */
+	0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, /* ?@ABCDEFGH */
+	0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, /* IJKLMNOPQR */
+	0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0xff, 0xff, /* STUVWXYZ[\ */
+	0xff, 0xff, 0xff, 0xff, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, /* ]^_`abcdef */
+	0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, /* ghijklmnop */
+	0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, /* qrstuvwxyz */
 };
 RADIX_TABLE_REVERSE_SIZE :: 80;
 
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index 3381065bb..ced8e5f5d 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -23,6 +23,7 @@ Category :: enum {
 	sqr,
 	bitfield_extract,
 	rm_trials,
+	is_prime,
 };
 
 Event :: struct {

From 7fa04fa018297bafb148533581bf9756b560c256 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Thu, 2 Sep 2021 19:59:59 +0200
Subject: [PATCH 17/33] big: Fix M-R.

---
 core/math/big/example.odin | 39 +++++---------------------------------
 core/math/big/prime.odin   |  8 ++++----
 2 files changed, 9 insertions(+), 38 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 49df357d6..b2e3f82bd 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -93,45 +93,16 @@ demo :: proc() {
 	err: Error;
 	prime: bool;
 
-	foo := [4]f64{1, 2, 4, 5};
-	fmt.println(foo.rrr);
+	trials := 1;
 
-	trials := 15;
+	set(c, "3317044064679887385961981");
 
 	{
 		SCOPED_TIMING(.is_prime);
-		for p in _private_prime_table[2:] {
-
-			set(a, p);
-			prime, err = internal_int_is_prime(a, trials);
-			if !prime || err != nil {
-				fmt.printf("%v wrongly flagged as composite\n", p);
-			}
-
-			set(a, p - 1);
-			prime, err = internal_int_is_prime(a, trials);
-			if prime || err != nil {
-				fmt.printf("%v wrongly flagged as prime\n", p);
-			}
-
-			set(a, p + 1);
-			prime, err = internal_int_is_prime(a, trials);
-			if prime || err != nil {
-				fmt.printf("%v wrongly flagged as prime\n", p);
-			}
-		}
-	}
-	Timings[.is_prime].count = len(_private_prime_table[2:]) * 3;
-
-	internal_set(a, "3317044064679887385961981");
-
-	{
-		SCOPED_TIMING(.is_prime);
-		prime, err = internal_int_is_prime(a, trials);
-		if prime || err != nil {
-			print("Wrongly flagged as prime: ", a);
-		}
+		prime, err = internal_int_is_prime(c, trials);
 	}
+	//print("prime: ", c);
+	fmt.printf("%v %v\n", prime, err);
 }
 
 main :: proc() {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f97d4fe81..47a9b2974 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -223,7 +223,7 @@ internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocat
 	/*
 		Ensure `b` > 1.
 	*/
-	if internal_gt(b, 1) { return false, nil; }
+	if internal_lte(b, 1) { return false, nil; }
 
 	/*
 		Get `n1` = `a` - 1.
@@ -291,10 +291,10 @@ internal_int_prime_miller_rabin :: proc(a, b: ^Int, allocator := context.allocat
 	`a` is the big Int to test for primality.
 
 	`miller_rabin_trials` can be one of the following:
-		< 0:	For `a` up to 3_317_044_064_679_887_385_961_981, set `miller_rabin_trials` to negative to run a predetermined
+		`< 0`:	For `a` up to 3_317_044_064_679_887_385_961_981, set `miller_rabin_trials` to negative to run a predetermined
 				number of trials for a deterministic answer.
-		= 0:	Run Miller-Rabin with bases 2, 3 and one random base < `a`. Non-deterministic.
-		> 0:	Run Miller-Rabin with bases 2, 3 and `miller_rabin_trials` number of random bases. Non-deterministic.
+		`= 0`:	Run Miller-Rabin with bases 2, 3 and one random base < `a`. Non-deterministic.
+		`> 0`:	Run Miller-Rabin with bases 2, 3 and `miller_rabin_trials` number of random bases. Non-deterministic.
 
 	`miller_rabin_only`:
 		`false`	Also use either Frobenius-Underwood or Lucas-Selfridge, depending on the compile-time `MATH_BIG_USE_FROBENIUS_TEST` choice.

From eecc786bd2d7d3717b84d2828f535e19c92fc66b Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Fri, 3 Sep 2021 01:25:18 +0200
Subject: [PATCH 18/33] big: Add Frobenius-Underwood.

---
 core/math/big/build.bat     |   2 +-
 core/math/big/example.odin  |  15 +++--
 core/math/big/internal.odin |  12 +++-
 core/math/big/prime.odin    | 106 +++++++++++++++++++++++++++++++++---
 4 files changed, 117 insertions(+), 18 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index d9495e612..43ece1054 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,5 +1,5 @@
 @echo off
-odin run . -vet
+odin run . -vet -define:MATH_BIG_USE_FROBENIUS_TEST=true
 
 set TEST_ARGS=-fast-tests
 :set TEST_ARGS=
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index b2e3f82bd..fb1e51053 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -84,7 +84,7 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-// printf :: fmt.printf;
+printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
@@ -93,16 +93,15 @@ demo :: proc() {
 	err: Error;
 	prime: bool;
 
-	trials := 1;
-
-	set(c, "3317044064679887385961981");
-
+	set(a, "3317044064679887385961981"); // Composite: 1287836182261 × 2575672364521
+	trials := number_of_rabin_miller_trials(internal_count_bits(a));
 	{
 		SCOPED_TIMING(.is_prime);
-		prime, err = internal_int_is_prime(c, trials);
+		prime, err = internal_int_is_prime(a, trials);
 	}
-	//print("prime: ", c);
-	fmt.printf("%v %v\n", prime, err);
+	print("Candidate prime: ", a);
+	fmt.printf("%v Miller-Rabin trials needed.\n", trials);
+	fmt.printf("Is prime: %v, Error: %v\n", prime, err);
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index c603dcdd8..6ae2f4284 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -2019,8 +2019,18 @@ internal_invmod :: proc{ internal_int_inverse_modulo, };
 /*
 	Helpers to extract values from the `Int`.
 */
+internal_int_bitfield_extract_bool :: proc(a: ^Int, offset: int) -> (val: bool, err: Error) {
+	limb := offset / _DIGIT_BITS;
+	if limb < 0 || limb >= a.used  { return false, .Invalid_Argument; }
+	i := _WORD(1 << _WORD((offset % _DIGIT_BITS)));
+	return bool(_WORD(a.digit[limb]) & i), nil;
+}
+
 internal_int_bitfield_extract_single :: proc(a: ^Int, offset: int) -> (bit: _WORD, err: Error) {
-	return #force_inline int_bitfield_extract(a, offset, 1);
+	limb := offset / _DIGIT_BITS;
+	if limb < 0 || limb >= a.used  { return 0, .Invalid_Argument; }
+	i := _WORD(1 << _WORD((offset % _DIGIT_BITS)));
+	return 1 if ((_WORD(a.digit[limb]) & i) != 0) else 0, nil;
 }
 
 internal_int_bitfield_extract :: proc(a: ^Int, offset, count: int) -> (res: _WORD, err: Error) #no_bounds_check {
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 47a9b2974..48c72de0d 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -366,13 +366,7 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 	if !miller_rabin_only {
 		if miller_rabin_trials >= 0 {
 			when MATH_BIG_USE_FROBENIUS_TEST {
-//				err = mp_prime_frobenius_underwood(a, &res);
-//				if ((err != MP_OKAY) && (err != MP_ITER)) {
-//					goto LBL_B;
-//				}
-//				if (!res) {
-//					goto LBL_B;
-//				}
+				if !internal_int_prime_frobenius_underwood(a) or_return { return; }
 			} else {
 //				if ((err = mp_prime_strong_lucas_selfridge(a, &res)) != MP_OKAY) {
 //					goto LBL_B;
@@ -506,6 +500,102 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 	return true, nil;
 }
 
+/*
+ * floor of positive solution of (2^16) - 1 = (a + 4) * (2 * a + 5)
+ * TODO: Both values are smaller than N^(1/4), would have to use a bigint
+ *       for `a` instead, but any `a` bigger than about 120 are already so rare that
+ *       it is possible to ignore them and still get enough pseudoprimes.
+ *       But it is still a restriction of the set of available pseudoprimes
+ *       which makes this implementation less secure if used stand-alone.
+ */
+_FROBENIUS_UNDERWOOD_A :: 32764;
+
+internal_int_prime_frobenius_underwood :: proc(N: ^Int, allocator := context.allocator) -> (result: bool, err: Error) {
+	context.allocator = allocator;
+
+	T1z, T2z, Np1z, sz, tz := &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer internal_destroy(T1z, T2z, Np1z, sz, tz);
+
+	internal_init_multi(T1z, T2z, Np1z, sz, tz) or_return;
+
+	a, ap2: int;
+
+	frob: for a = 0; a < _FROBENIUS_UNDERWOOD_A; a += 1 {
+		switch a {
+		case 2, 4, 7, 8, 10, 14, 18, 23, 26, 28:
+			continue frob;
+		}
+
+		internal_set(T1z, i32((a * a) - 4));
+		j := internal_int_kronecker(T1z, N) or_return;
+
+		switch j {
+		case -1: break frob;
+		case  0: return false, nil;
+		}
+	}
+
+	// Tell it a composite and set return value accordingly.
+	if a >= _FROBENIUS_UNDERWOOD_A { return false, .Max_Iterations_Reached; }
+
+	// Composite if N and (a+4)*(2*a+5) are not coprime.
+	internal_set(T1z, u32((a + 4) * ((2 * a) + 5)));
+	internal_int_gcd_lcm(T1z, nil, T1z, N) or_return;
+
+	if !(T1z.used == 1 && T1z.digit[0] == 1) {
+		// Composite.
+		return false, nil;
+	}
+
+	ap2 = a + 2;
+	internal_add(Np1z, N, 1) or_return;
+
+	internal_set(sz, 1) or_return;
+	internal_set(tz, 2) or_return;
+
+	for i := internal_count_bits(Np1z) - 2; i >= 0; i -= 1 {
+		// temp = (sz * (a * sz + 2 * tz)) % N;
+		// tz   = ((tz - sz) * (tz + sz)) % N;
+		// sz   = temp;
+
+		internal_int_shl1(T2z, tz) or_return;
+
+		// a = 0 at about 50% of the cases (non-square and odd input)
+		if a != 0 {
+			internal_mul(T1z, sz, DIGIT(a)) or_return;
+			internal_add(T2z, T2z, T1z) or_return;
+		}
+
+		internal_mul(T1z, T2z, sz) or_return;
+		internal_sub(T2z, tz, sz) or_return;
+		internal_add(sz, sz, tz) or_return;
+		internal_mul(tz, sz, T2z) or_return;
+		internal_mod(tz, tz, N) or_return;
+		internal_mod(sz, T1z, N) or_return;
+
+		if bit, _ := internal_int_bitfield_extract_bool(Np1z, i); bit {
+			// temp = (a+2) * sz + tz
+			// tz   = 2 * tz - sz
+			// sz   = temp
+			if a == 0 {
+				internal_int_shl1(T1z, sz) or_return;
+			} else {
+				internal_mul(T1z, sz, DIGIT(ap2)) or_return;
+			}
+			internal_add(T1z, T1z, tz) or_return;
+			internal_int_shl1(T2z, tz) or_return;
+			internal_sub(tz, T2z, sz);
+			internal_swap(sz, T1z);
+		}
+	}
+
+	internal_set(T1z, u32((2 * a) + 5)) or_return;
+	internal_mod(T1z, T1z, N) or_return;
+
+	result = internal_is_zero(sz) && internal_eq(tz, T1z);
+
+	return;
+}
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
@@ -513,7 +603,7 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 number_of_rabin_miller_trials :: proc(bit_size: int) -> (number_of_trials: int) {
 	switch {
 	case bit_size <=    80:
-		return - 1;		/* Use deterministic algorithm for size <= 80 bits */
+		return -1;		/* Use deterministic algorithm for size <= 80 bits */
 	case bit_size >=    81 && bit_size <     96:
 		return 37;		/* max. error = 2^(-96)  */
 	case bit_size >=    96 && bit_size <    128:

From 7ed4f01d02b311bb2d2854ae5556fbea70af6dda Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Fri, 3 Sep 2021 11:02:31 +0100
Subject: [PATCH 19/33] Add `reflect.equal` and `reflect.not_equal`

---
 core/reflect/reflect.odin | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index a7aec0d19..881f32b6e 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -1276,7 +1276,9 @@ as_raw_data :: proc(a: any) -> (value: rawptr, valid: bool) {
 	return;
 }
 
-/*
+eq :: equal;
+ne :: not_equal;
+
 not_equal :: proc(a, b: any) -> bool {
 	return !equal(a, b);
 }
@@ -1353,4 +1355,3 @@ equal :: proc(a, b: any) -> bool {
 
 	return true;
 }
-*/

From 11ae87cc2fec12d5bda0052ecc29d91d60b68a22 Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Fri, 3 Sep 2021 12:00:43 +0100
Subject: [PATCH 20/33] Add `including_indirect_array_recursion` argument to
 `reflect.equal`

---
 core/reflect/reflect.odin | 69 +++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 7 deletions(-)

diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index 881f32b6e..6e2748e88 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -1279,10 +1279,12 @@ as_raw_data :: proc(a: any) -> (value: rawptr, valid: bool) {
 eq :: equal;
 ne :: not_equal;
 
-not_equal :: proc(a, b: any) -> bool {
-	return !equal(a, b);
+DEFAULT_EQUAL_MAX_RECURSION_LEVEL :: 32;
+
+not_equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_level := 0) -> bool {
+	return !equal(a, b, including_indirect_array_recursion, recursion_level);
 }
-equal :: proc(a, b: any) -> bool {
+equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_level := 0) -> bool {
 	if a == nil && b == nil {
 		return true;
 	}
@@ -1307,6 +1309,11 @@ equal :: proc(a, b: any) -> bool {
 	if .Simple_Compare in t.flags {
 		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0;
 	}
+	
+	including_indirect_array_recursion := including_indirect_array_recursion;
+	if recursion_level >= DEFAULT_EQUAL_MAX_RECURSION_LEVEL {
+		including_indirect_array_recursion = false;
+	} 
 
 	t = runtime.type_info_core(t);
 
@@ -1321,23 +1328,25 @@ equal :: proc(a, b: any) -> bool {
 			y := (^string)(b.data)^;
 			return x == y;
 		}
-
+		return true;
 	case Type_Info_Array:
 		for i in 0..<v.count {
 			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i));
 			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i));
-			if !equal(any{x, v.elem.id}, any{y, v.elem.id}) {
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level) {
 				return false;
 			}
 		}
+		return true;
 	case Type_Info_Enumerated_Array:
 		for i in 0..<v.count {
 			x := rawptr(uintptr(a.data) + uintptr(v.elem_size*i));
 			y := rawptr(uintptr(b.data) + uintptr(v.elem_size*i));
-			if !equal(any{x, v.elem.id}, any{y, v.elem.id}) {
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level) {
 				return false;
 			}
 		}
+		return true;
 	case Type_Info_Struct:
 		if v.equal != nil {
 			return v.equal(a.data, b.data);
@@ -1346,11 +1355,57 @@ equal :: proc(a, b: any) -> bool {
 				x := rawptr(uintptr(a.data) + offset);
 				y := rawptr(uintptr(b.data) + offset);
 				id := v.types[i].id;
-				if !equal(any{x, id}, any{y, id}) {
+				if !equal(any{x, id}, any{y, id}, including_indirect_array_recursion, recursion_level) {
 					return false;
 				}
 			}
+			return true;
 		}
+	case Type_Info_Union:
+		if v.equal != nil {
+			return v.equal(a.data, b.data);
+		}
+		return false;
+	case Type_Info_Slice:
+		if !including_indirect_array_recursion {
+			break;
+		}
+		array_a := (^mem.Raw_Slice)(a.data);
+		array_b := (^mem.Raw_Slice)(b.data);
+		if array_a.len != array_b.len {
+			return false;
+		}
+		if array_a.data == array_b.data {
+			return true;
+		}
+		for i in 0..<array_a.len {
+			x := rawptr(uintptr(array_a.data) + uintptr(v.elem_size*i));
+			y := rawptr(uintptr(array_b.data) + uintptr(v.elem_size*i));
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level+1) {
+				return false;
+			}	
+		}
+		return true;
+	case Type_Info_Dynamic_Array:
+		if !including_indirect_array_recursion {
+			break;
+		}
+		array_a := (^mem.Raw_Dynamic_Array)(a.data);
+		array_b := (^mem.Raw_Dynamic_Array)(b.data);
+		if array_a.len != array_b.len {
+			return false;
+		}
+		if array_a.data == array_b.data {
+			return true;
+		}
+		for i in 0..<array_a.len {
+			x := rawptr(uintptr(array_a.data) + uintptr(v.elem_size*i));
+			y := rawptr(uintptr(array_b.data) + uintptr(v.elem_size*i));
+			if !equal(any{x, v.elem.id}, any{y, v.elem.id}, including_indirect_array_recursion, recursion_level+1) {
+				return false;
+			}	
+		}
+		return true;
 	}
 
 	return true;

From 70e12f7a1c58adb01875cd972731acebefbdb918 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Fri, 3 Sep 2021 14:48:16 +0200
Subject: [PATCH 21/33] big: Fix internal_int_mod for inputs with opposite
 signs.

This threw off Frobenius-Underwood.
---
 core/math/big/build.bat     |  6 +++---
 core/math/big/example.odin  | 14 ++++++++++----
 core/math/big/internal.odin | 26 ++++++++++++++++++++++----
 core/math/big/prime.odin    |  4 ++--
 4 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 43ece1054..47e940888 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,10 +1,10 @@
 @echo off
-odin run . -vet -define:MATH_BIG_USE_FROBENIUS_TEST=true
+:odin run . -vet -define:MATH_BIG_USE_FROBENIUS_TEST=true
 
 set TEST_ARGS=-fast-tests
-:set TEST_ARGS=
+set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index fb1e51053..9c5fd6bc7 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -84,16 +84,20 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-printf :: fmt.printf;
+//printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
-	err: Error;
+	err:  Error;
+	frob:  bool;
 	prime: bool;
 
-	set(a, "3317044064679887385961981"); // Composite: 1287836182261 × 2575672364521
+	// USE_MILLER_RABIN_ONLY = true;
+
+	// set(a, "3317044064679887385961979"); // Composite: 17 × 1709 × 1366183751 × 83570142193
+	set(a, "359334085968622831041960188598043661065388726959079837"); // 6th Bell prime
 	trials := number_of_rabin_miller_trials(internal_count_bits(a));
 	{
 		SCOPED_TIMING(.is_prime);
@@ -101,7 +105,9 @@ demo :: proc() {
 	}
 	print("Candidate prime: ", a);
 	fmt.printf("%v Miller-Rabin trials needed.\n", trials);
-	fmt.printf("Is prime: %v, Error: %v\n", prime, err);
+
+	frob, err = internal_int_prime_frobenius_underwood(a);
+	fmt.printf("Frobenius-Underwood: %v, Prime: %v, Error: %v\n", frob, prime, err);
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 6ae2f4284..81cb325d7 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -854,7 +854,7 @@ internal_int_mod :: proc(remainder, numerator, denominator: ^Int, allocator := c
 
 	if remainder.used == 0 || denominator.sign == remainder.sign { return nil; }
 
-	return #force_inline internal_add(remainder, remainder, numerator, allocator);
+	return #force_inline internal_add(remainder, remainder, denominator, allocator);
 }
 
 internal_int_mod_digit :: proc(numerator: ^Int, denominator: DIGIT, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
@@ -2747,9 +2747,27 @@ internal_int_count_lsb :: proc(a: ^Int) -> (count: int, err: Error) {
 	x: int;
 	#no_bounds_check for x = 0; x < a.used && a.digit[x] == 0; x += 1 {}
 
-	q := a.digit[x];
-	x *= _DIGIT_BITS;
-	x += internal_count_lsb(q);
+	when true {
+		q := a.digit[x];
+		x *= _DIGIT_BITS;
+		x += internal_count_lsb(q);
+	} else {
+		lnz := []int{
+   			4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+		};
+
+		q := a.digit[x];
+		x *= _DIGIT_BITS;
+		if q & 1 == 0 {
+			p: DIGIT;
+			for {
+				p = q & 15;
+				x += lnz[p];
+				q >>= 4;
+				if p != 0 { break; }
+			}
+		}		
+	}
 	return x, nil;
 }
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 48c72de0d..316a9de29 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -185,14 +185,14 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 				a1.digit[0] + 1 cannot overflow because the MSB
 				of the DIGIT type is not set by definition.
 			 */
-			if a1.digit[0] + 1 & p1.digit[0] & 2 != 0 {
+			if ((a1.digit[0] + 1) & p1.digit[0] & 2) != 0 {
 				k = -k;
 			}
 		} else {
 			/*
 				Compute k = (-1)^((a1-1)*(p1-1)/4) * k.
 			*/
-			if a1.digit[0] & p1.digit[0] & 2 != 0 {
+			if (a1.digit[0] & p1.digit[0] & 2) != 0 {
 				k = -k;
 			}
 		}

From c9f4fdc8566a633fc2b279bc183319bfb7986567 Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Fri, 3 Sep 2021 15:49:29 +0100
Subject: [PATCH 22/33] Update reflect.equal to support more types

---
 core/reflect/reflect.odin | 53 +++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index 6e2748e88..db770b891 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -1296,9 +1296,14 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 	if a.data == b.data {
 		return true;
 	}
+	
+	including_indirect_array_recursion := including_indirect_array_recursion;
+	if recursion_level >= DEFAULT_EQUAL_MAX_RECURSION_LEVEL {
+		including_indirect_array_recursion = false;
+	} 
 
 	t := type_info_of(a.id);
-	if .Comparable not_in t.flags {
+	if .Comparable not_in t.flags && !including_indirect_array_recursion {
 		return false;
 	}
 
@@ -1310,14 +1315,36 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0;
 	}
 	
-	including_indirect_array_recursion := including_indirect_array_recursion;
-	if recursion_level >= DEFAULT_EQUAL_MAX_RECURSION_LEVEL {
-		including_indirect_array_recursion = false;
-	} 
-
 	t = runtime.type_info_core(t);
 
-	#partial switch v in t.variant {
+	switch v in t.variant {
+	case Type_Info_Named:
+		unreachable();
+	case Type_Info_Any:
+		return false;
+	case Type_Info_Tuple:
+		unreachable();
+	case Type_Info_Map:
+		return false;
+	case Type_Info_Relative_Slice:
+		return false;
+	case 
+		Type_Info_Boolean,
+		Type_Info_Integer, 
+		Type_Info_Rune,
+		Type_Info_Float,
+		Type_Info_Complex,
+		Type_Info_Quaternion,
+		Type_Info_Type_Id,
+		Type_Info_Pointer,
+		Type_Info_Multi_Pointer,
+		Type_Info_Procedure,
+		Type_Info_Bit_Set,
+		Type_Info_Enum,
+		Type_Info_Simd_Vector,
+		Type_Info_Relative_Pointer:
+		return mem.compare_byte_ptrs((^byte)(a.data), (^byte)(b.data), t.size) == 0;
+		
 	case Type_Info_String:
 		if v.is_cstring {
 			x := string((^cstring)(a.data)^);
@@ -1368,7 +1395,7 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		return false;
 	case Type_Info_Slice:
 		if !including_indirect_array_recursion {
-			break;
+			return false;
 		}
 		array_a := (^mem.Raw_Slice)(a.data);
 		array_b := (^mem.Raw_Slice)(b.data);
@@ -1388,7 +1415,7 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		return true;
 	case Type_Info_Dynamic_Array:
 		if !including_indirect_array_recursion {
-			break;
+			return false;
 		}
 		array_a := (^mem.Raw_Dynamic_Array)(a.data);
 		array_b := (^mem.Raw_Dynamic_Array)(b.data);
@@ -1398,6 +1425,10 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		if array_a.data == array_b.data {
 			return true;
 		}
+		if .Simple_Compare in v.elem.flags {
+			return mem.compare_byte_ptrs((^byte)(array_a.data), (^byte)(array_b.data), array_a.len * v.elem.size) == 0;
+		}
+		
 		for i in 0..<array_a.len {
 			x := rawptr(uintptr(array_a.data) + uintptr(v.elem_size*i));
 			y := rawptr(uintptr(array_b.data) + uintptr(v.elem_size*i));
@@ -1407,6 +1438,8 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		}
 		return true;
 	}
-
+	
+	runtime.print_typeid(a.id);
+	runtime.print_string("\n");
 	return true;
 }

From e3809f5c1b10963bcdbcebe925f0d3a31c0ea893 Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Fri, 3 Sep 2021 15:52:47 +0100
Subject: [PATCH 23/33] Allow comparisons of `any` if `reflect.equal` if
 `including_indirect_array_recursion` is enabled

---
 core/reflect/reflect.odin | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index db770b891..929a25c9a 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -1320,10 +1320,15 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 	switch v in t.variant {
 	case Type_Info_Named:
 		unreachable();
-	case Type_Info_Any:
-		return false;
 	case Type_Info_Tuple:
 		unreachable();
+	case Type_Info_Any:
+		if !including_indirect_array_recursion {
+			return false;
+		}
+		va := (^any)(a.data);
+		vb := (^any)(b.data);
+		return equal(va, vb, including_indirect_array_recursion, recursion_level+1); 
 	case Type_Info_Map:
 		return false;
 	case Type_Info_Relative_Slice:

From b1ed7fc6b9bb16540a76d8edf286415f641ae120 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Fri, 3 Sep 2021 23:41:14 +0200
Subject: [PATCH 24/33] big: Add Lucas-Selfridge.

---
 core/math/big/build.bat     |   7 +-
 core/math/big/example.odin  |  10 +-
 core/math/big/internal.odin |  29 ++++-
 core/math/big/prime.odin    | 244 ++++++++++++++++++++++++++++++++++--
 4 files changed, 274 insertions(+), 16 deletions(-)

diff --git a/core/math/big/build.bat b/core/math/big/build.bat
index 47e940888..7ca32641b 100644
--- a/core/math/big/build.bat
+++ b/core/math/big/build.bat
@@ -1,10 +1,11 @@
 @echo off
-:odin run . -vet -define:MATH_BIG_USE_FROBENIUS_TEST=true
+odin run . -vet
+: -define:MATH_BIG_USE_FROBENIUS_TEST=true
 
 set TEST_ARGS=-fast-tests
-set TEST_ARGS=
+:set TEST_ARGS=
 :odin build . -build-mode:shared -show-timings -o:minimal -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
-odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
+:odin build . -build-mode:shared -show-timings -o:size -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:size -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -no-bounds-check -define:MATH_BIG_EXE=false && python test.py %TEST_ARGS%
 :odin build . -build-mode:shared -show-timings -o:speed -define:MATH_BIG_EXE=false && python test.py -fast-tests %TEST_ARGS%
\ No newline at end of file
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 9c5fd6bc7..e324d5e29 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -84,14 +84,14 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-//printf :: fmt.printf;
+// printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
 	err:  Error;
-	frob:  bool;
+	lucas: bool;
 	prime: bool;
 
 	// USE_MILLER_RABIN_ONLY = true;
@@ -103,11 +103,11 @@ demo :: proc() {
 		SCOPED_TIMING(.is_prime);
 		prime, err = internal_int_is_prime(a, trials);
 	}
-	print("Candidate prime: ", a);
+	print("Candidate prime: ", a, 10, true, true, true);
 	fmt.printf("%v Miller-Rabin trials needed.\n", trials);
 
-	frob, err = internal_int_prime_frobenius_underwood(a);
-	fmt.printf("Frobenius-Underwood: %v, Prime: %v, Error: %v\n", frob, prime, err);
+	// lucas, err = internal_int_prime_strong_lucas_selfridge(a);
+	fmt.printf("Lucas-Selfridge: %v, Prime: %v, Error: %v\n", lucas, prime, err);
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 81cb325d7..5b09e97e2 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -544,6 +544,25 @@ internal_int_shl1 :: proc(dest, src: ^Int, allocator := context.allocator) -> (e
 	return internal_clamp(dest);
 }
 
+/*
+	Multiply bigint `a` with int `d` and put the result in `dest`.
+ 	Like `internal_int_mul_digit` but with an integer as the small input.
+*/
+internal_int_mul_integer :: proc(dest, a: ^Int, b: $T, allocator := context.allocator) -> (err: Error)
+where intrinsics.type_is_integer(T) && T != DIGIT {
+	context.allocator = allocator;
+
+	t := &Int{};
+	defer internal_destroy(t);
+
+	/*
+		DIGIT might be smaller than a long, which excludes the use of `internal_int_mul_digit` here.
+	*/
+	internal_set(t, b) or_return;
+	internal_mul(dest, a, t) or_return;
+	return;
+}
+
 /*
 	Multiply by a DIGIT.
 */
@@ -697,7 +716,7 @@ internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.alloc
 	return err;
 }
 
-internal_mul :: proc { internal_int_mul, internal_int_mul_digit, };
+internal_mul :: proc { internal_int_mul, internal_int_mul_digit, internal_int_mul_integer };
 
 internal_sqr :: proc (dest, src: ^Int, allocator := context.allocator) -> (res: Error) {
 	/*
@@ -940,6 +959,14 @@ internal_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.
 	return #force_inline _private_int_gcd_lcm(res_gcd, res_lcm, a, b, allocator);
 }
 
+internal_int_gcd :: proc(res_gcd, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	return #force_inline _private_int_gcd_lcm(res_gcd, nil, a, b, allocator);
+}
+
+internal_int_lcm :: proc(res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	return #force_inline _private_int_gcd_lcm(nil, res_lcm, a, b, allocator);
+}
+
 /*
 	remainder = numerator % (1 << bits)
 
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 316a9de29..0e3749273 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -368,12 +368,7 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 			when MATH_BIG_USE_FROBENIUS_TEST {
 				if !internal_int_prime_frobenius_underwood(a) or_return { return; }
 			} else {
-//				if ((err = mp_prime_strong_lucas_selfridge(a, &res)) != MP_OKAY) {
-//					goto LBL_B;
-//				}
-//				if (!res) {
-//					goto LBL_B;
-//				}
+				if !internal_int_prime_strong_lucas_selfridge(a) or_return { return; }
 			}
 		}
 	}
@@ -540,7 +535,7 @@ internal_int_prime_frobenius_underwood :: proc(N: ^Int, allocator := context.all
 
 	// Composite if N and (a+4)*(2*a+5) are not coprime.
 	internal_set(T1z, u32((a + 4) * ((2 * a) + 5)));
-	internal_int_gcd_lcm(T1z, nil, T1z, N) or_return;
+	internal_int_gcd(T1z, T1z, N) or_return;
 
 	if !(T1z.used == 1 && T1z.digit[0] == 1) {
 		// Composite.
@@ -597,6 +592,241 @@ internal_int_prime_frobenius_underwood :: proc(N: ^Int, allocator := context.all
 	return;
 }
 
+
+/*
+	Strong Lucas-Selfridge test.
+	returns true if it is a strong L-S prime, false if it is composite
+
+	Code ported from Thomas Ray Nicely's implementation of the BPSW test at http://www.trnicely.net/misc/bpsw.html
+
+	Freeware copyright (C) 2016 Thomas R. Nicely <http://www.trnicely.net>.
+	Released into the public domain by the author, who disclaims any legal liability arising from its use.
+
+	The multi-line comments are made by Thomas R. Nicely and are copied verbatim.
+	(If that name sounds familiar, he is the guy who found the fdiv bug in the Pentium CPU.)
+*/
+internal_int_prime_strong_lucas_selfridge :: proc(a: ^Int, allocator := context.allocator) -> (lucas_selfridge: bool, err: Error) {
+	// TODO: choose better variable names!
+
+	Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer internal_destroy(Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz);
+
+	/*
+		Find the first element D in the sequence {5, -7, 9, -11, 13, ...}
+		such that Jacobi(D,N) = -1 (Selfridge's algorithm). Theory
+		indicates that, if N is not a perfect square, D will "nearly
+		always" be "small." Just in case, an overflow trap for D is	included.
+	*/
+	internal_init_multi(Dz, gcd, Np1, Uz, Vz, U2mz, V2mz, Qmz, Q2mz, Qkdz, T1z, T2z, T3z, T4z, Q2kdz) or_return;
+
+	D    := 5;
+	sign := 1;
+	Ds   : int;
+
+	for {
+		Ds   = sign * D;
+		sign = -sign;
+
+		internal_set(Dz, D) or_return;
+		internal_int_gcd(gcd, a, Dz) or_return;
+
+		/*
+			If 1 < GCD < `N` then `N` is composite with factor "D", and
+			Jacobi(D, N) is technically undefined (but often returned as zero).
+		*/
+		if internal_gt(gcd, 1) && internal_lt(gcd, a)    { return; }
+		if Ds < 0 { Dz.sign = .Negative; }
+
+		j := internal_int_kronecker(Dz, a) or_return;
+		if j == -1 { break; }
+
+		D += 2;
+		if D > max(int) - 2                              { return false, .Invalid_Argument; }
+	}
+
+	Q := (1 - Ds) / 4;   /* Required so D = P*P - 4*Q */
+
+	/*
+		NOTE: The conditions (a) N does not divide Q, and
+		(b) D is square-free or not a perfect square, are included by
+		some authors; e.g., "Prime numbers and computer methods for
+		factorization," Hans Riesel (2nd ed., 1994, Birkhauser, Boston),
+		p. 130. For this particular application of Lucas sequences,
+		these conditions were found to be immaterial.
+	*/
+
+	/*
+		Now calculate N - Jacobi(D,N) = N + 1 (even), and calculate the
+		odd positive integer d and positive integer s for which
+		N + 1 = 2^s*d (similar to the step for N - 1 in Miller's test).
+		The strong Lucas-Selfridge test then returns N as a strong
+		Lucas probable prime (slprp) if any of the following
+		conditions is met: U_d=0, V_d=0, V_2d=0, V_4d=0, V_8d=0,
+		V_16d=0, ..., etc., ending with V_{2^(s-1)*d}=V_{(N+1)/2}=0
+		(all equalities mod N). Thus d is the highest index of U that
+		must be computed (since V_2m is independent of U), compared
+		to U_{N+1} for the standard Lucas-Selfridge test; and no
+		index of V beyond (N+1)/2 is required, just as in the
+		standard Lucas-Selfridge test. However, the quantity Q^d must
+		be computed for use (if necessary) in the latter stages of
+		the test. The result is that the strong Lucas-Selfridge test
+		has a running time only slightly greater (order of 10 %) than
+		that of the standard Lucas-Selfridge test, while producing
+		only (roughly) 30 % as many pseudoprimes (and every strong
+		Lucas pseudoprime is also a standard Lucas pseudoprime). Thus
+		the evidence indicates that the strong Lucas-Selfridge test is
+		more effective than the standard Lucas-Selfridge test, and a
+		Baillie-PSW test based on the strong Lucas-Selfridge test
+		should be more reliable.
+	*/
+	internal_add(Np1, a, 1) or_return;
+	s := internal_count_lsb(Np1) or_return;
+
+	/*
+		This should round towards zero because Thomas R. Nicely used GMP's mpz_tdiv_q_2exp()
+		and mp_div_2d() is equivalent. Additionally: dividing an even number by two does not produce
+		any leftovers.
+	*/
+	internal_int_shr(Dz, Np1, s) or_return;
+
+	/*
+		We must now compute U_d and V_d. Since d is odd, the accumulated
+		values U and V are initialized to U_1 and V_1 (if the target
+		index were even, U and V would be initialized instead to U_0=0
+		and V_0=2). The values of U_2m and V_2m are also initialized to
+		U_1 and V_1; the FOR loop calculates in succession U_2 and V_2,
+		U_4 and V_4, U_8 and V_8, etc. If the corresponding bits
+		(1, 2, 3, ...) of t are on (the zero bit having been accounted
+		for in the initialization of U and V), these values are then
+		combined with the previous totals for U and V, using the
+		composition formulas for addition of indices.
+	*/
+	internal_set(Uz,   1) or_return;
+	internal_set(Vz,   1) or_return; //	P := 1; /* Selfridge's choice */
+	internal_set(U2mz, 1) or_return;
+	internal_set(V2mz, 1) or_return; //	P := 1; /* Selfridge's choice */
+	internal_set(Qmz,  Q) or_return;
+
+	internal_int_shl1(Q2mz, Qmz) or_return;
+
+	/*
+		Initializes calculation of Q^d.
+	*/
+	internal_set(Qkdz, Q) or_return;
+	Nbits := internal_count_bits(Dz);
+
+	for u := 1; u < Nbits; u += 1 { /* zero bit off, already accounted for */
+		/*
+			Formulas for doubling of indices (carried out mod N). Note that
+			the indices denoted as "2m" are actually powers of 2, specifically
+			2^(ul-1) beginning each loop and 2^ul ending each loop.
+			U_2m = U_m*V_m
+			V_2m = V_m*V_m - 2*Q^m
+		*/
+		internal_mul(U2mz, U2mz, V2mz) or_return;
+		internal_mod(U2mz, U2mz, a) or_return;
+		internal_sqr(V2mz, V2mz) or_return;
+		internal_sub(V2mz, V2mz, Q2mz) or_return;
+		internal_mod(V2mz, V2mz, a) or_return;
+
+		/*
+			Must calculate powers of Q for use in V_2m, also for Q^d later.
+		*/
+		internal_sqr(Qmz, Qmz) or_return;
+
+		/* Prevents overflow. Still necessary without a fixed prealloc'd mem.? */
+		internal_mod(Qmz, Qmz, a) or_return;
+		internal_int_shl1(Q2mz, Qmz) or_return;
+
+		if internal_int_bitfield_extract_bool(Dz, u) or_return {
+			/*
+				Formulas for addition of indices (carried out mod N);
+				U_(m+n) = (U_m*V_n + U_n*V_m)/2
+				V_(m+n) = (V_m*V_n + D*U_m*U_n)/2
+				Be careful with division by 2 (mod N)!
+			*/
+			internal_mul(T1z, U2mz, Vz) or_return;
+			internal_mul(T2z, Uz, V2mz) or_return;
+			internal_mul(T3z, V2mz, Vz) or_return;
+			internal_mul(T4z, U2mz, Uz) or_return;
+			internal_mul(T4z, T4z,  Ds) or_return;
+
+			internal_add(Uz,  T1z, T2z) or_return;
+
+			if internal_is_odd(Uz) {
+				internal_add(Uz, Uz, a) or_return;
+			}
+
+			/*
+				This should round towards negative infinity because Thomas R. Nicely used GMP's mpz_fdiv_q_2exp().
+				But `internal_shr1` does not do so, it is truncating instead.
+			*/
+			oddness := internal_is_odd(Uz);
+			internal_int_shr1(Uz, Uz) or_return;
+			if internal_is_negative(Uz) && oddness {
+				internal_sub(Uz, Uz, 1) or_return;
+			}
+			internal_add(Vz, T3z, T4z) or_return;
+			if internal_is_odd(Vz) {
+				internal_add(Vz, Vz, a) or_return;
+			}
+
+			oddness  = internal_is_odd(Vz);
+			internal_int_shr1(Vz, Vz) or_return;
+			if internal_is_negative(Vz) && oddness {
+				internal_sub(Vz, Vz, 1) or_return;
+			}
+			internal_mod(Uz, Uz, a) or_return;
+			internal_mod(Vz, Vz, a) or_return;
+
+			/* Calculating Q^d for later use */
+			internal_mul(Qkdz, Qkdz, Qmz) or_return;
+			internal_mod(Qkdz, Qkdz, a) or_return;
+		}
+	}
+
+	/*
+		If U_d or V_d is congruent to 0 mod N, then N is a prime or a strong Lucas pseudoprime. */
+	if internal_is_zero(Uz) || internal_is_zero(Vz) {
+		return true, nil;
+	}
+
+	/*
+		NOTE: Ribenboim ("The new book of prime number records," 3rd ed.,
+		1995/6) omits the condition V0 on p.142, but includes it on
+		p. 130. The condition is NECESSARY; otherwise the test will
+		return false negatives---e.g., the primes 29 and 2000029 will be
+		returned as composite.
+	*/
+
+	/*
+		Otherwise, we must compute V_2d, V_4d, V_8d, ..., V_{2^(s-1)*d}
+		by repeated use of the formula V_2m = V_m*V_m - 2*Q^m. If any of
+		these are congruent to 0 mod N, then N is a prime or a strong
+		Lucas pseudoprime.
+	*/
+
+	/* Initialize 2*Q^(d*2^r) for V_2m */
+	internal_int_shr1(Q2kdz, Qkdz) or_return;
+
+	for r := 1; r < s; r += 1 {
+		internal_sqr(Vz, Vz) or_return;
+		internal_sub(Vz, Vz, Q2kdz) or_return;
+		internal_mod(Vz, Vz, a) or_return;
+		if internal_is_zero(Vz) {
+			return true, nil;
+		}
+		/* Calculate Q^{d*2^r} for next r (final iteration irrelevant). */
+		if r < (s - 1) {
+			internal_sqr(Qkdz, Qkdz) or_return;
+			internal_mod(Qkdz, Qkdz, a) or_return;
+			internal_int_shl1(Q2kdz, Qkdz) or_return;
+		}
+	}
+	return false, nil;
+}
+
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From 52da5b8724df577daacde7d705257c722422cd19 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Fri, 3 Sep 2021 23:53:32 +0200
Subject: [PATCH 25/33] big: Default to Frobenius-Underwood. It's 10% faster
 than Lucas-Selfridge.

---
 core/math/big/common.odin | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index 4d8224cd6..243e74d0c 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -80,7 +80,8 @@ FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS := 100;
 
 	Use Frobenius-Underwood for primality testing, or use Lucas-Selfridge (default).
 */
-MATH_BIG_USE_FROBENIUS_TEST :: #config(MATH_BIG_USE_FROBENIUS_TEST, false);
+MATH_BIG_USE_LUCAS_SELFRIDGE_TEST :: #config(MATH_BIG_USE_LUCAS_SELFRIDGE_TEST, false);
+MATH_BIG_USE_FROBENIUS_TEST       :: !MATH_BIG_USE_LUCAS_SELFRIDGE_TEST;
 
 /*
 	Runtime tunable to use Miller-Rabin primality testing only and skip the above.

From f2c5c26f2c4c192038ee25017b329964e8b45492 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sat, 4 Sep 2021 16:31:05 +0200
Subject: [PATCH 26/33] big: Add `internal_int_prime_next_prime`.

---
 core/math/big/example.odin  |  26 +--
 core/math/big/internal.odin |   2 +-
 core/math/big/prime.odin    | 344 +++++++++++++++++++++++++++++++++++-
 core/math/big/private.odin  |   5 +-
 4 files changed, 354 insertions(+), 23 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index e324d5e29..252cff0bf 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -26,7 +26,7 @@ Configuration:
 	_WARRAY                               %v
 	_TAB_SIZE                             %v
 	_MAX_WIN_SIZE                         %v
-	MATH_BIG_USE_FROBENIUS_TEST           %v
+	MATH_BIG_USE_LUCAS_SELFRIDGE_TEST     %v
 Runtime tunable:
 	MUL_KARATSUBA_CUTOFF                  %v
 	SQR_KARATSUBA_CUTOFF                  %v
@@ -47,7 +47,7 @@ _MAX_COMBA,
 _WARRAY,
 _TAB_SIZE,
 _MAX_WIN_SIZE,
-MATH_BIG_USE_FROBENIUS_TEST,
+MATH_BIG_USE_LUCAS_SELFRIDGE_TEST,
 
 MUL_KARATSUBA_CUTOFF,
 SQR_KARATSUBA_CUTOFF,
@@ -90,24 +90,12 @@ demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
-	err:  Error;
-	lucas: bool;
-	prime: bool;
-
-	// USE_MILLER_RABIN_ONLY = true;
-
-	// set(a, "3317044064679887385961979"); // Composite: 17 × 1709 × 1366183751 × 83570142193
-	set(a, "359334085968622831041960188598043661065388726959079837"); // 6th Bell prime
+	set(a, _private_prime_table[_PRIME_TAB_SIZE - 1]);
+	print("a: ", a);
 	trials := number_of_rabin_miller_trials(internal_count_bits(a));
-	{
-		SCOPED_TIMING(.is_prime);
-		prime, err = internal_int_is_prime(a, trials);
-	}
-	print("Candidate prime: ", a, 10, true, true, true);
-	fmt.printf("%v Miller-Rabin trials needed.\n", trials);
-
-	// lucas, err = internal_int_prime_strong_lucas_selfridge(a);
-	fmt.printf("Lucas-Selfridge: %v, Prime: %v, Error: %v\n", lucas, prime, err);
+	err := internal_int_prime_next_prime(a, trials, false);
+	print("a->next: ", a);
+	fmt.printf("Trials: %v, Error: %v\n", trials, err);
 }
 
 main :: proc() {
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 5b09e97e2..5b7e84a87 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -880,7 +880,7 @@ internal_int_mod_digit :: proc(numerator: ^Int, denominator: DIGIT, allocator :=
 	return internal_int_divmod_digit(nil, numerator, denominator, allocator);
 }
 
-internal_mod :: proc{ internal_int_mod, internal_int_mod_digit};
+internal_mod :: proc{ internal_int_mod, internal_int_mod_digit, };
 
 /*
 	remainder = (number + addend) % modulus.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 0e3749273..f2d360e3f 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -112,7 +112,7 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, allocator := context.alloc
 }
 
 /*
-	Kronecker symbol (a|p)
+	Kronecker/Legendre symbol (a|p)
 	Straightforward implementation of algorithm 1.4.10 in
 	Henri Cohen: "A Course in Computational Algebraic Number Theory"
 
@@ -205,6 +205,7 @@ internal_int_kronecker :: proc(a, p: ^Int, allocator := context.allocator) -> (k
 	}
 	return;
 }
+internal_int_legendre :: internal_int_kronecker;
 
 /*
 	Miller-Rabin test of "a" to the base of "b" as described in HAC pp. 139 Algorithm 4.24.
@@ -826,6 +827,347 @@ internal_int_prime_strong_lucas_selfridge :: proc(a: ^Int, allocator := context.
 	return false, nil;
 }
 
+/*
+	Performs one Fermat test.
+
+	If "a" were prime then b**a == b (mod a) since the order of
+	the multiplicative sub-group would be phi(a) = a-1.  That means
+	it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+
+	Returns `true` if the congruence holds, or `false` otherwise.
+
+	Assumes `a` and `b` not to be `nil` and to have been initialized.
+*/
+internal_prime_fermat :: proc(a, b: ^Int, allocator := context.allocator) -> (fermat: bool, err: Error) {
+	t := &Int{};
+	defer internal_destroy(t);
+
+	/*
+		Ensure `b` > 1.
+	*/
+	if !internal_gt(b, 1) { return false, .Invalid_Argument; }
+
+	/*
+		Compute `t` = `b`**`a` mod `a`
+	*/
+	internal_int_exponent_mod(t, b, a, a) or_return;
+
+	/*
+		Is it equal to b?
+	*/
+	fermat = internal_eq(t, b);
+	return;
+}
+
+/*
+	Tonelli-Shanks algorithm
+	https://en.wikipedia.org/wiki/Tonelli%E2%80%93Shanks_algorithm
+	https://gmplib.org/list-archives/gmp-discuss/2013-April/005300.html
+*/
+internal_int_sqrtmod_prime :: proc(res, n, prime: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	/*
+		The type is "int" because of the types in the mp_int struct.
+		Don't forget to change them here when you change them there!
+	*/
+	S, M, i: int;
+
+	t1, C, Q, Z, T, R, two := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer internal_destroy(t1, C, Q, Z, T, R, two);
+
+	/*
+		First handle the simple cases.
+	*/
+	if internal_is_zero(n)                                           { return internal_zero(res);	}
+
+	/*
+		"prime" must be odd and > 2
+	*/
+	if internal_is_even(prime) || internal_lt(prime, 3)              { return .Invalid_Argument; }
+	legendre := internal_int_kronecker(n, prime)                     or_return;
+
+	/*
+		n \not\cong 0 (mod p) and n \cong r^2 (mod p) for some r \in N^+
+	*/
+	if legendre != 1                                                 { return .Invalid_Argument; }
+
+	internal_init_multi(t1, C, Q, Z, T, R, two)                      or_return;
+
+	/*
+		SPECIAL CASE: if prime mod 4 == 3
+		compute directly: err = n^(prime+1)/4 mod prime
+		Handbook of Applied Cryptography algorithm 3.36
+
+		x%4 == x&3 for x in N and x>0
+	*/
+	if prime.digit[0] & 3 == 3 {
+		internal_add(t1, prime, 1)                                   or_return;
+		internal_shr(t1, t1, 2)                                      or_return;
+		internal_int_exponent_mod(res, n, t1, prime)                 or_return;
+		return;
+	}
+
+	/*
+		NOW: Tonelli-Shanks algorithm
+		Factor out powers of 2 from prime-1, defining Q and S as: prime-1 = Q*2^S
+
+		Q = prime - 1
+	*/
+	internal_copy(Q, prime)                                          or_return;
+	internal_sub(Q, Q, 1)                                            or_return;
+
+	/*
+		S = 0
+	*/
+	S = 0;
+	for internal_is_even(Q) {
+		/*
+			Q = Q / 2
+		*/
+		internal_int_shr1(Q, Q)                                      or_return;
+		/*
+			S = S + 1
+		*/
+		S += 1;
+	}
+
+	/*
+		Find a `Z` such that the Legendre symbol (Z|prime) == -1.
+		Z = 2.
+	*/
+	internal_set(Z, 2)                                               or_return;
+
+	for {
+		legendre = internal_int_kronecker(Z, prime)                  or_return;
+
+		/*
+			If "prime" (p) is an odd prime Jacobi(k|p) = 0 for k \cong 0 (mod p)
+			but there is at least one non-quadratic residue before k>=p if p is an odd prime.
+		*/
+		if legendre == 0                                             { return .Invalid_Argument; }
+		if legendre == -1                                            { break; }
+
+		/*
+			Z = Z + 1
+		*/
+		internal_add(Z, Z, 1)                                        or_return;
+	}
+
+	/*
+		C = Z ^ Q mod prime
+	*/
+	internal_int_exponent_mod(C, Z, Q, prime)                        or_return;
+
+	/*
+		t1 = (Q + 1) / 2
+	*/
+	internal_add(t1, Q, 1)                                           or_return;
+	internal_int_shr1(t1, t1)                                        or_return;
+
+	/*
+		R = n ^ ((Q + 1) / 2) mod prime
+	*/
+	internal_int_exponent_mod(R, n, t1, prime)                       or_return;
+
+	/*
+		T = n ^ Q mod prime
+	*/
+	internal_int_exponent_mod(T, n, Q, prime)                        or_return;
+
+	/*
+		M = S
+	*/
+	M = S;
+	internal_set(two, 2);
+
+	for {
+		internal_copy(t1, T)                                         or_return;
+
+		i = 0;
+		for {
+			if internal_eq(T, 1)                                     { break; }
+
+			/*
+				No exponent in the range 0 < i < M found.
+				(M is at least 1 in the first round because "prime" > 2)
+			*/
+			if M == i                                                { return .Invalid_Argument; }
+			internal_int_exponent_mod(t1, t1, two, prime)            or_return;
+
+			i += 1;
+		}
+
+		if i == 0 {
+			internal_copy(res, R)                                    or_return;
+		}
+
+		/*
+			t1 = 2 ^ (M - i - 1)
+		*/
+		internal_set(t1, M - i - 1)                                  or_return;
+		internal_int_exponent_mod(t1, two, t1, prime)                or_return;
+
+		/*
+			t1 = C ^ (2 ^ (M - i - 1)) mod prime
+		*/
+		internal_int_exponent_mod(t1, C, t1, prime)                  or_return;
+
+		/*
+			C = (t1 * t1) mod prime
+		*/
+		internal_sqrmod(C, t1, prime)                                or_return;
+
+		/*
+			R = (R * t1) mod prime
+		*/
+		internal_mulmod(R, R, t1, prime)                             or_return;
+
+		/*
+			T = (T * C) mod prime
+		*/
+		mulmod(T, T, C, prime)                                       or_return;
+
+		/*
+			M = i
+		*/
+		M = i;
+	}
+
+	return;
+}
+
+/*
+	Finds the next prime after the number `a` using `t` trials of Miller-Rabin,
+	in place: It sets `a` to the prime found.
+	`bbs_style` = true means the prime must be congruent to 3 mod 4
+*/
+internal_int_prime_next_prime :: proc(a: ^Int, trials: int, bbs_style: bool, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	res_tab := [_PRIME_TAB_SIZE]DIGIT{};
+
+	/*
+		Force positive.
+	*/
+	a.sign = .Zero_or_Positive;
+
+	/*
+		Simple algo if `a` is less than the largest prime in the table.
+	*/
+	if internal_lt(a, _private_prime_table[_PRIME_TAB_SIZE - 1]) {
+		/*
+			Find which prime it is bigger than `a`
+		*/
+		for p in _private_prime_table {
+			cmp := internal_cmp(a, p);
+
+			if cmp == 0 { continue; }
+			if cmp != 1 {
+				if bbs_style && (p & 3 != 3) {
+					/*
+						Try again until we get a prime congruent to 3 mod 4.
+					*/
+					continue;
+				} else {
+					return internal_set(a, p);
+				}
+			}
+		}
+		/*
+			Fall through to the sieve.
+		*/
+	}
+
+	/*
+		Generate a prime congruent to 3 mod 4 or 1/3 mod 4?
+	*/
+	kstep: DIGIT = 4 if bbs_style else 2;
+
+	/*
+		At this point we will use a combination of a sieve and Miller-Rabin.
+	*/
+	if bbs_style {
+		/*
+			If `a` mod 4 != 3 subtract the correct value to make it so.
+		*/
+		if a.digit[0] & 3 != 3 {
+			internal_sub(a, a, (a.digit[0] & 3) + 1) or_return;
+		}
+	} else {
+		if internal_is_even(a) {
+			/*
+				Force odd.
+			*/
+			internal_sub(a, a, 1) or_return;
+		}
+	}
+
+	/*
+		Generate the restable.
+	*/
+	for x := 1; x < _PRIME_TAB_SIZE; x += 1 {
+		res_tab = internal_mod(a, _private_prime_table[x]) or_return;
+	}
+
+	for {
+		step := DIGIT(0);
+		y: bool;
+
+		/*
+			Skip to the next non-trivially divisible candidate.
+		*/
+		for {
+			/*
+				y == true if any residue was zero [e.g. cannot be prime]
+			*/
+			y = false;
+
+			/*
+				Increase step to next candidate.
+			*/
+			step += kstep;
+
+			/*
+				Compute the new residue without using division.
+			*/
+			for x := 1; x < _PRIME_TAB_SIZE; x += 1 {
+				/*
+					Add the step to each residue.
+				*/
+				res_tab[x] += kstep;
+
+				/*
+					Subtract the modulus [instead of using division].
+				*/
+				if res_tab[x] >= _private_prime_table[x] {
+					res_tab[x] -= _private_prime_table[x];
+				}
+
+				/*
+					Set flag if zero.
+				*/
+				if res_tab[x] == 0 {
+					y = true;
+				}
+			}
+			if !(y && (step < (((1 << _DIGIT_BITS) - kstep)))) { break; }
+		}
+
+		/*
+			Add the step.
+		*/
+		internal_add(a, a, step) or_return;
+
+		/*
+			If we didn't pass the sieve and step == MP_MAX then skip test */
+		if (y && (step >= ((1 << _DIGIT_BITS) - kstep))) { continue; }
+
+		if internal_int_is_prime(a, trials) or_return { break; }
+	}
+	return;
+}
+
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 002dbda09..feffa1141 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -3223,7 +3223,8 @@ _private_int_rem_105 := [?]DIGIT{
 };
 #assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105));
 
-_private_prime_table := [?]DIGIT{
+_PRIME_TAB_SIZE :: 256;
+_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
 	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
 	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
 	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
@@ -3260,7 +3261,7 @@ _private_prime_table := [?]DIGIT{
 	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
 	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
 };
-#assert(256 * size_of(DIGIT) == size_of(_private_prime_table));
+#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table));
 
 when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 	_factorial_table := [35]_WORD{

From 1f5ce91ae21c975655a68d70714ab62434b914f6 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sun, 5 Sep 2021 10:40:35 +0200
Subject: [PATCH 27/33] big: Add `internal_random_prime`.

---
 core/math/big/common.odin   |  17 +++++-
 core/math/big/example.odin  |  25 +++++---
 core/math/big/helpers.odin  |   6 +-
 core/math/big/internal.odin |  33 ++++++++++-
 core/math/big/prime.odin    | 115 +++++++++++++++++++++++++++++++++++-
 core/math/big/tune.odin     |   1 +
 6 files changed, 180 insertions(+), 17 deletions(-)

diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index 243e74d0c..fb215cd69 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -88,6 +88,17 @@ MATH_BIG_USE_FROBENIUS_TEST       :: !MATH_BIG_USE_LUCAS_SELFRIDGE_TEST;
 */
 USE_MILLER_RABIN_ONLY       := false;
 
+/*
+	How many times we'll call `internal_int_random` during random prime generation before we bail out.
+	Set to 0 or less to try indefinitely.	
+*/
+MAX_ITERATIONS_RANDOM_PRIME  := 1_000_000;
+
+/*
+	How many iterations we used for the last random prime.
+*/
+@thread_local RANDOM_PRIME_ITERATIONS_USED: int;
+
 /*
 	We don't allow these to be switched at runtime for two reasons:
 
@@ -175,9 +186,9 @@ Error_String :: #partial [Error]string{
 };
 
 Primality_Flag :: enum u8 {
-	Blum_Blum_Shub = 0,	/* BBS style prime */
-	Safe           = 1,	/* Safe prime (p-1)/2 == prime */
-	Second_MSB_On  = 3,  /* force 2nd MSB to 1 */
+ 	Blum_Blum_Shub = 0,  // Make prime congruent to 3 mod 4
+	Safe           = 1,  // Make sure (p-1)/2 is prime as well (implies .Blum_Blum_Shub)
+	Second_MSB_On  = 3,  // Make the 2nd highest bit one
 };
 Primality_Flags :: bit_set[Primality_Flag; u8];
 
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 252cff0bf..45cfae778 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -37,6 +37,7 @@ Runtime tunable:
 	FACTORIAL_BINARY_SPLIT_CUTOFF         %v
 	FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS %v
 	USE_MILLER_RABIN_ONLY                 %v
+	MAX_ITERATIONS_RANDOM_PRIME           %v
 
 `, _DIGIT_BITS,
 _LOW_MEMORY,
@@ -58,6 +59,7 @@ FACTORIAL_MAX_N,
 FACTORIAL_BINARY_SPLIT_CUTOFF,
 FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS,
 USE_MILLER_RABIN_ONLY,
+MAX_ITERATIONS_RANDOM_PRIME,
 );
 
 }
@@ -84,18 +86,27 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-// printf :: fmt.printf;
+printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
-	set(a, _private_prime_table[_PRIME_TAB_SIZE - 1]);
-	print("a: ", a);
-	trials := number_of_rabin_miller_trials(internal_count_bits(a));
-	err := internal_int_prime_next_prime(a, trials, false);
-	print("a->next: ", a);
-	fmt.printf("Trials: %v, Error: %v\n", trials, err);
+	bits   := 111;
+	trials := -1;
+
+	flags := Primality_Flags{};
+	fmt.printf("Trying to generate a %v bit prime using %v Miller-Rabin trials and options %v.\n", bits, trials, flags);
+
+	err: Error;
+	{
+		SCOPED_TIMING(.random_prime);
+		err = internal_random_prime(a, bits, trials, flags);
+	}
+
+	print("a(10): ", a, 10, true, true, true);
+	fmt.printf("err: %v\n", err);
+	fmt.printf("RANDOM_PRIME_ITERATIONS_USED: %v\n", RANDOM_PRIME_ITERATIONS_USED);
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index ff654172c..ccd00615f 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -362,15 +362,15 @@ int_random_digit :: proc(r: ^rnd.Rand = nil) -> (res: DIGIT) {
 	return 0; // We shouldn't get here.
 }
 
-int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
+int_random :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
 	/*
 		Check that `a` is usable.
 	*/
 	assert_if_nil(dest);
-	return #force_inline internal_int_rand(dest, bits, r, allocator);
+	return #force_inline internal_int_random(dest, bits, r, allocator);
 
 }
-rand :: proc { int_rand, };
+random :: proc { int_random, };
 
 /*
 	Internal helpers.
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 5b7e84a87..a0516be3d 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -2045,6 +2045,7 @@ internal_invmod :: proc{ internal_int_inverse_modulo, };
 
 /*
 	Helpers to extract values from the `Int`.
+	Offset is zero indexed.
 */
 internal_int_bitfield_extract_bool :: proc(a: ^Int, offset: int) -> (val: bool, err: Error) {
 	limb := offset / _DIGIT_BITS;
@@ -2115,6 +2116,34 @@ internal_int_bitfield_extract :: proc(a: ^Int, offset, count: int) -> (res: _WOR
 	return res, nil;
 }
 
+/*
+	Helpers to (un)set a bit in an Int.
+	Offset is zero indexed.
+*/
+internal_int_bitfield_set_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS;
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument; }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)));
+	a.digit[limb] |= i;
+	return;
+}
+
+internal_int_bitfield_unset_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS;
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument; }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)));
+	a.digit[limb] &= _MASK - i;
+	return;
+}
+
+internal_int_bitfield_toggle_single :: proc(a: ^Int, offset: int) -> (err: Error) {
+	limb := offset / _DIGIT_BITS;
+	if limb < 0 || limb >= a.used  { return .Invalid_Argument; }
+	i := DIGIT(1 << uint((offset % _DIGIT_BITS)));
+	a.digit[limb] ~= i;
+	return;
+}
+
 /*
 	Resize backing store.
 	We don't need to pass the allocator, because the storage itself stores it.
@@ -2817,7 +2846,7 @@ internal_int_random_digit :: proc(r: ^rnd.Rand = nil) -> (res: DIGIT) {
 	return 0; // We shouldn't get here.
 }
 
-internal_int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
+internal_int_random :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
 	context.allocator = allocator;
 
 	bits := bits;
@@ -2842,7 +2871,7 @@ internal_int_rand :: proc(dest: ^Int, bits: int, r: ^rnd.Rand = nil, allocator :
 	dest.used = digits;
 	return nil;
 }
-internal_rand :: proc { internal_int_rand, };
+internal_random :: proc { internal_int_random, };
 
 /*
 	Internal helpers.
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index f2d360e3f..ae538281c 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -456,7 +456,7 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 		for ix := 0; ix < miller_rabin_trials; ix += 1 {
 
 			// rand() guarantees the first digit to be non-zero
-			internal_rand(b, _DIGIT_TYPE_BITS, r) or_return;
+			internal_random(b, _DIGIT_TYPE_BITS, r) or_return;
 
 			// Reduce digit before casting because DIGIT might be bigger than
 			// an unsigned int and "mask" on the other side is most probably not.
@@ -474,7 +474,7 @@ internal_int_is_prime :: proc(a: ^Int, miller_rabin_trials := int(-1), miller_ra
 				ix -= 1;
 				continue;
 			}
-			internal_rand(b, l) or_return;
+			internal_random(b, l) or_return;
 
 			// That number might got too big and the witness has to be smaller than "a"
 			l = internal_count_bits(b);
@@ -1168,6 +1168,117 @@ internal_int_prime_next_prime :: proc(a: ^Int, trials: int, bbs_style: bool, all
 	return;
 }
 
+/*
+	Makes a truly random prime of a given size (bits),
+
+	Flags are as follows:
+	 	Blum_Blum_Shub    - Make prime congruent to 3 mod 4
+		Safe              - Make sure (p-1)/2 is prime as well (implies .Blum_Blum_Shub)
+		Second_MSB_On     - Make the 2nd highest bit one
+
+	This is possibly the mother of all prime generation functions, muahahahahaha!
+*/
+internal_random_prime :: proc(a: ^Int, size_in_bits: int, trials: int, flags := Primality_Flags{}, r: ^rnd.Rand = nil, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+	flags  := flags;
+	trials := trials;
+
+	t := &Int{};
+	defer internal_destroy(t);
+
+	/*
+		Sanity check the input.
+	*/
+	if size_in_bits <= 1 || trials < -1                              { return .Invalid_Argument; }
+
+	/*
+		`.Safe` implies `.Blum_Blum_Shub`.
+	*/
+	if .Safe in flags {
+		if size_in_bits < 3 {
+			/*
+				The smallest safe prime is 5, which takes 3 bits.
+				We early out now, else we'd be locked in an infinite loop trying to generate a 2-bit Safe Prime.
+			*/
+			return .Invalid_Argument;
+		}
+		flags += { .Blum_Blum_Shub, };
+	}
+
+	/*
+		Automatically choose the number of Rabin-Miller trials?
+	*/
+	if trials == -1 {
+		trials = number_of_rabin_miller_trials(size_in_bits);
+	}
+
+	res: bool;
+	RANDOM_PRIME_ITERATIONS_USED = 0;
+
+	for {
+		if MAX_ITERATIONS_RANDOM_PRIME > 0 {
+			RANDOM_PRIME_ITERATIONS_USED += 1;
+			if RANDOM_PRIME_ITERATIONS_USED > MAX_ITERATIONS_RANDOM_PRIME {
+				return .Max_Iterations_Reached;
+			}
+		}
+
+		internal_int_random(a, size_in_bits)                         or_return;
+
+		/*
+			Make sure it's odd.
+		*/
+		if size_in_bits > 2 {
+			a.digit[0] |= 1;
+		} else {
+			/*
+				A 2-bit prime can be either 2 (0b10) or 3 (0b11).
+				So, let's force the top bit to 1 and return early.
+			*/
+			a.digit[0] |= 2;
+			return nil;
+		}
+
+		if .Blum_Blum_Shub in flags {
+			a.digit[0] |= 3;
+		}
+		if .Second_MSB_On in flags {
+			internal_int_bitfield_set_single(a, size_in_bits - 2) or_return;
+		}
+
+		/*
+			Is it prime?
+		*/
+		res = internal_int_is_prime(a, trials)                       or_return;
+
+		if (!res) {
+			continue;
+		}
+
+		if .Safe in flags {
+			/*
+				See if (a-1)/2 is prime.
+			*/
+			internal_sub(a, a, 1)                                    or_return;
+			internal_int_shr1(a, a)                                  or_return;
+
+			/*
+				Is it prime?
+			*/
+			res = internal_int_is_prime(a, trials)                   or_return;
+		}
+		if res { break; }
+	}
+
+	if .Safe in flags {
+		/*
+			Restore a to the original value.
+		*/
+		internal_int_shl1(a, a)                                      or_return;
+		internal_add(a, a, 1)                                        or_return;
+	}
+	return;
+}
 
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
diff --git a/core/math/big/tune.odin b/core/math/big/tune.odin
index ced8e5f5d..6ce5121b4 100644
--- a/core/math/big/tune.odin
+++ b/core/math/big/tune.odin
@@ -24,6 +24,7 @@ Category :: enum {
 	bitfield_extract,
 	rm_trials,
 	is_prime,
+	random_prime,
 };
 
 Event :: struct {

From f33d0725db8d4cc4a7e8e6f81da4a72fc34f5b15 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sun, 5 Sep 2021 14:03:02 +0200
Subject: [PATCH 28/33] big: Add Extended Euclidean algorithm.

---
 core/math/big/example.odin |  4 +-
 core/math/big/prime.odin   | 83 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 45cfae778..6b697d472 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -86,13 +86,13 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-printf :: fmt.printf;
+// printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
-	bits   := 111;
+	bits   := 64;
 	trials := -1;
 
 	flags := Primality_Flags{};
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index ae538281c..c3e4822df 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -1280,6 +1280,89 @@ internal_random_prime :: proc(a: ^Int, size_in_bits: int, trials: int, flags :=
 	return;
 }
 
+/*
+	Extended Euclidean algorithm of (a, b) produces `a * u1` + `b * u2` = `u3`.
+*/
+internal_int_extended_euclidean :: proc(a, b, U1, U2, U3: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
+	defer internal_destroy(u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp);
+	internal_init_multi(u1, u2, u3, v1, v2, v3, t1, t2, t3, q, tmp)  or_return;
+
+	/*
+		Initialize, (u1, u2, u3) = (1, 0, a).
+	*/
+	internal_set(u1, 1)                                              or_return;
+	internal_set(u3, a)                                              or_return;
+
+	/*
+		Initialize, (v1, v2, v3) = (0, 1, b).
+	*/
+	internal_set(v2, 1)                                              or_return;
+	internal_set(v3, b)                                              or_return;
+
+	/*
+		Loop while v3 != 0
+	*/
+	for !internal_is_zero(v3) {
+		/*
+			q = u3 / v3
+		*/
+		internal_div(q, u3, v3)                                      or_return;
+
+		/*
+			(t1, t2, t3) = (u1, u2, u3) - (v1, v2, v3)q
+		*/
+		internal_mul(tmp, v1, q)                                     or_return;
+		internal_sub( t1, u1, tmp)                                   or_return;
+
+		internal_mul(tmp, v2, q)                                     or_return;
+		internal_sub( t2, u2, tmp)                                   or_return;
+
+		internal_mul(tmp, v3, q)                                     or_return;
+		internal_sub( t3, u3, tmp)                                   or_return;
+
+		/*
+			(u1, u2, u3) = (v1, v2, v3)
+		*/
+		internal_set(u1, v1)                                         or_return;
+		internal_set(u2, v2)                                         or_return;
+		internal_set(u3, v3)                                         or_return;
+
+		/*
+			(v1, v2, v3) = (t1, t2, t3)
+		*/
+		internal_set(v1, t1)                                         or_return;
+		internal_set(v2, t2)                                         or_return;
+		internal_set(v3, t3)                                         or_return;
+	}
+
+	/*
+		Make sure U3 >= 0.
+	*/
+	if internal_is_negative(u3) {
+		internal_neg(u1, u1)                                         or_return;
+		internal_neg(u2, u2)                                         or_return;
+		internal_neg(u3, u3)                                         or_return;
+	}
+
+	/*
+		Copy result out.
+	*/
+	if U1 != nil {
+		internal_swap(u1, U1);
+	}
+	if U2 != nil {
+		internal_swap(u2, U2);
+	}
+	if U3 != nil {
+		internal_swap(u3, U3);
+	}
+	return;
+}
+
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */

From 3faac14d62c2fb05e7ee9e207f29033897d4ae77 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Sun, 5 Sep 2021 15:50:23 +0200
Subject: [PATCH 29/33] big: Add ASCII file import/export.

---
 core/math/big/common.odin  |  4 ++
 core/math/big/example.odin | 14 ++++++-
 core/math/big/radix.odin   | 79 ++++++++++++++++++++++++++++++++------
 3 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index fb215cd69..11810d144 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -166,6 +166,10 @@ Error :: enum int {
 	Division_by_Zero        = 8,
 	Math_Domain_Error       = 9,
 
+	Cannot_Open_File        = 50,
+	Cannot_Read_File        = 51,
+	Cannot_Write_File       = 52,
+
 	Unimplemented           = 127,
 };
 
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 6b697d472..dfc49bc15 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -86,7 +86,7 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-// printf :: fmt.printf;
+printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
@@ -107,6 +107,18 @@ demo :: proc() {
 	print("a(10): ", a, 10, true, true, true);
 	fmt.printf("err: %v\n", err);
 	fmt.printf("RANDOM_PRIME_ITERATIONS_USED: %v\n", RANDOM_PRIME_ITERATIONS_USED);
+
+	// err = internal_int_write_to_ascii_file(a, "a.txt");
+	// if err != nil {
+	// 	fmt.printf("internal_int_write_to_ascii_file returned %v\n", err);
+	// }
+
+	// err = internal_int_read_from_ascii_file(b, "a.txt");
+	// if err != nil {
+	// 	fmt.printf("internal_int_read_from_ascii_file returned %v\n", err);
+	// }
+
+	// print("b: ", b);
 }
 
 main :: proc() {
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index 76854e244..fb1af250b 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -16,20 +16,18 @@ package math_big
 
 import "core:intrinsics"
 import "core:mem"
+import "core:os"
 
 /*
-	This version of `itoa` allocates one behalf of the caller. The caller must free the string.
+	This version of `itoa` allocates on behalf of the caller. The caller must free the string.
+	The radix defaults to 10.
 */
-int_itoa_string :: proc(a: ^Int, radix := i8(-1), zero_terminate := false, allocator := context.allocator) -> (res: string, err: Error) {
+int_itoa_string :: proc(a: ^Int, radix := i8(10), zero_terminate := false, allocator := context.allocator) -> (res: string, err: Error) {
 	assert_if_nil(a);
 	context.allocator = allocator;
 
 	a := a; radix := radix;
 	clear_if_uninitialized(a) or_return;
-	/*
-		Radix defaults to 10.
-	*/
-	radix = radix if radix > 0 else 10;
 
 	/*
 		TODO: If we want to write a prefix for some of the radixes, we can oversize the buffer.
@@ -57,18 +55,15 @@ int_itoa_string :: proc(a: ^Int, radix := i8(-1), zero_terminate := false, alloc
 }
 
 /*
-	This version of `itoa` allocates one behalf of the caller. The caller must free the string.
+	This version of `itoa` allocates on behalf of the caller. The caller must free the string.
+	The radix defaults to 10.
 */
-int_itoa_cstring :: proc(a: ^Int, radix := i8(-1), allocator := context.allocator) -> (res: cstring, err: Error) {
+int_itoa_cstring :: proc(a: ^Int, radix := i8(10), allocator := context.allocator) -> (res: cstring, err: Error) {
 	assert_if_nil(a);
 	context.allocator = allocator;
 
 	a := a; radix := radix;
 	clear_if_uninitialized(a) or_return;
-	/*
-		Radix defaults to 10.
-	*/
-	radix = radix if radix > 0 else 10;
 
 	s: string;
 	s, err = int_itoa_string(a, radix, true);
@@ -377,6 +372,66 @@ radix_size :: proc(a: ^Int, radix: i8, zero_terminate := false, allocator := con
 	return size, nil;
 }
 
+/*
+	We might add functions to read and write byte-encoded Ints from/to files, using `int_to_bytes_*` functions.
+
+	LibTomMath allows exporting/importing to/from a file in ASCII, but it doesn't support a much more compact representation in binary, even though it has several pack functions int_to_bytes_* (which I expanded upon and wrote Python interoperable versions of as well), and (un)pack, which is GMP compatible.
+	Someone could implement their own read/write binary int procedures, of course.
+
+	Could be worthwhile to add a canonical binary file representation with an optional small header that says it's an Odin big.Int, big.Rat or Big.Float, byte count for each component that follows, flag for big/little endian and a flag that says a checksum exists at the end of the file.
+	For big.Rat and big.Float the header couldn't be optional, because we'd have no way to distinguish where the components end.
+*/
+
+/*
+	Read an Int from an ASCII file.
+*/
+internal_int_read_from_ascii_file :: proc(a: ^Int, filename: string, radix := i8(10), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	/*
+		We can either read the entire file at once, or read a bunch at a time and keep multiplying by the radix.
+		For now, we'll read the entire file. Eventually we'll replace this with a copy that duplicates the logic
+		of `atoi` so we don't need to read the entire file.
+	*/
+
+	res, ok := os.read_entire_file(filename, allocator);
+	defer delete(res, allocator);
+
+	if !ok {
+		return .Cannot_Read_File;
+	}
+
+	as := string(res);
+	return atoi(a, as, radix);
+}
+
+/*
+	Write an Int to an ASCII file.
+*/
+internal_int_write_to_ascii_file :: proc(a: ^Int, filename: string, radix := i8(10), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	/*
+		For now we'll convert the Int using itoa and writing the result in one go.
+		If we want to preserve memory we could duplicate the itoa logic and write backwards.
+	*/
+
+	as := itoa(a, radix) or_return;
+	defer delete(as);
+
+	l := len(as);
+	assert(l > 0);
+
+	data := transmute([]u8)mem.Raw_Slice{
+		data = raw_data(as),
+		len  = l,
+	};
+
+	ok := os.write_entire_file(name=filename, data=data, truncate=true);
+	return nil if ok else .Cannot_Write_File;
+}
+
+
 /*
 	Overestimate the size needed for the bigint to string conversion by a very small amount.
 	The error is about 10^-8; it will overestimate the result by at most 11 elements for

From de5d897b5c11097b34b6c594dac59db475601e91 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Mon, 6 Sep 2021 12:57:48 +0200
Subject: [PATCH 30/33] Add `internal_int_(pack, unpack)`.

---
 core/math/big/common.odin  |   7 ++-
 core/math/big/example.odin |  28 ++++++----
 core/math/big/helpers.odin |  14 ++---
 core/math/big/radix.odin   | 111 +++++++++++++++++++++++++++++++++++++
 4 files changed, 141 insertions(+), 19 deletions(-)

diff --git a/core/math/big/common.odin b/core/math/big/common.odin
index 11810d144..d8dbdcd95 100644
--- a/core/math/big/common.odin
+++ b/core/math/big/common.odin
@@ -186,6 +186,10 @@ Error_String :: #partial [Error]string{
 	.Division_by_Zero        = "Division by zero",
 	.Math_Domain_Error       = "Math domain error",
 
+	.Cannot_Open_File        = "Cannot_Open_File",
+	.Cannot_Read_File        = "Cannot_Read_File",
+	.Cannot_Write_File       = "Cannot_Write_File",
+
 	.Unimplemented           = "Unimplemented",
 };
 
@@ -231,7 +235,8 @@ when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 _DIGIT_TYPE_BITS :: 8 * size_of(DIGIT);
 _WORD_TYPE_BITS  :: 8 * size_of(_WORD);
 
-_DIGIT_BITS      :: _DIGIT_TYPE_BITS - 4;
+_DIGIT_NAILS     :: 4;
+_DIGIT_BITS      :: _DIGIT_TYPE_BITS - _DIGIT_NAILS;
 _WORD_BITS       :: 2 * _DIGIT_BITS;
 
 _MASK            :: (DIGIT(1) << DIGIT(_DIGIT_BITS)) - DIGIT(1);
diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index dfc49bc15..c18ce0cbf 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -86,13 +86,13 @@ print :: proc(name: string, a: ^Int, base := i8(10), print_name := true, newline
 	}
 }
 
-printf :: fmt.printf;
+// printf :: fmt.printf;
 
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
-	bits   := 64;
+	bits   := 111;
 	trials := -1;
 
 	flags := Primality_Flags{};
@@ -108,17 +108,23 @@ demo :: proc() {
 	fmt.printf("err: %v\n", err);
 	fmt.printf("RANDOM_PRIME_ITERATIONS_USED: %v\n", RANDOM_PRIME_ITERATIONS_USED);
 
-	// err = internal_int_write_to_ascii_file(a, "a.txt");
-	// if err != nil {
-	// 	fmt.printf("internal_int_write_to_ascii_file returned %v\n", err);
-	// }
+	nails := 0;
 
-	// err = internal_int_read_from_ascii_file(b, "a.txt");
-	// if err != nil {
-	// 	fmt.printf("internal_int_read_from_ascii_file returned %v\n", err);
-	// }
+	count := internal_int_pack_count(a, u8, nails);
+	buf := make([]u8, count);
+	defer delete(buf);
 
-	// print("b: ", b);
+	written: int;
+	order := Order.LSB_First;
+
+	fmt.printf("\na.digit: %v\n", a.digit[:a.used]);
+	written, err = internal_int_pack(a, buf, nails, order);
+	fmt.printf("\nPacked into buf: %v | err: %v | written: %v\n", buf, err, written);
+
+	err = internal_int_unpack(b, buf, nails, order);
+	print("\nUnpacked into b: ", b);
+	fmt.printf("err: %v\n", err);
+	fmt.printf("b.digit: %v\n", b.digit[:b.used]);
 }
 
 main :: proc() {
diff --git a/core/math/big/helpers.odin b/core/math/big/helpers.odin
index ccd00615f..352367a23 100644
--- a/core/math/big/helpers.odin
+++ b/core/math/big/helpers.odin
@@ -495,7 +495,7 @@ int_to_bytes_little :: proc(a: ^Int, buf: []u8, signed := false, allocator := co
 	if signed {
 		buf[l - 1] = 1 if a.sign == .Negative else 0;
 	}
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(a, offset, 8);
 		buf[i] = u8(bits & 255); i += 1;
 	}
@@ -519,7 +519,7 @@ int_to_bytes_big :: proc(a: ^Int, buf: []u8, signed := false, allocator := conte
 	if signed {
 		buf[0] = 1 if a.sign == .Negative else 0;
 	}
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(a, offset, 8);
 		buf[i] = u8(bits & 255); i -= 1;
 	}
@@ -546,7 +546,7 @@ int_to_bytes_little_python :: proc(a: ^Int, buf: []u8, signed := false, allocato
 
 		size_in_bits := internal_count_bits(t);
 		i := 0;
-		for offset := 0; offset < size_in_bits; offset += 8 {
+		#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 			bits, _ := internal_int_bitfield_extract(t, offset, 8);
 			buf[i] = 255 - u8(bits & 255); i += 1;
 		}
@@ -554,7 +554,7 @@ int_to_bytes_little_python :: proc(a: ^Int, buf: []u8, signed := false, allocato
 	} else {
 		size_in_bits := internal_count_bits(a);
 		i := 0;
-		for offset := 0; offset < size_in_bits; offset += 8 {
+		#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 			bits, _ := internal_int_bitfield_extract(a, offset, 8);
 			buf[i] = u8(bits & 255); i += 1;
 		}
@@ -583,7 +583,7 @@ int_to_bytes_big_python :: proc(a: ^Int, buf: []u8, signed := false, allocator :
 
 	size_in_bits := internal_count_bits(t);
 	i := l - 1;
-	for offset := 0; offset < size_in_bits; offset += 8 {
+	#no_bounds_check for offset := 0; offset < size_in_bits; offset += 8 {
 		bits, _ := internal_int_bitfield_extract(t, offset, 8);
 		buf[i] = 255 - u8(bits & 255); i -= 1;
 	}
@@ -620,7 +620,7 @@ int_from_bytes_big :: proc(a: ^Int, buf: []u8, signed := false, allocator := con
 		buf = buf[1:];
 	}
 
-	for v in buf {
+	#no_bounds_check for v in buf {
 		internal_shl(a, a, 8) or_return;
 		a.digit[0] |= DIGIT(v);
 	}
@@ -657,7 +657,7 @@ int_from_bytes_big_python :: proc(a: ^Int, buf: []u8, signed := false, allocator
 		buf = buf[1:];
 	}
 
-	for v in buf {
+	#no_bounds_check for v in buf {
 		internal_shl(a, a, 8) or_return;
 		if signed && sign == .Negative {
 			a.digit[0] |= DIGIT(255 - v);	
diff --git a/core/math/big/radix.odin b/core/math/big/radix.odin
index fb1af250b..f3a9b9d43 100644
--- a/core/math/big/radix.odin
+++ b/core/math/big/radix.odin
@@ -431,6 +431,117 @@ internal_int_write_to_ascii_file :: proc(a: ^Int, filename: string, radix := i8(
 	return nil if ok else .Cannot_Write_File;
 }
 
+/*
+	Calculate the size needed for `internal_int_pack`.
+
+	See https://gmplib.org/manual/Integer-Import-and-Export.html
+*/
+internal_int_pack_count :: proc(a: ^Int, $T: typeid, nails := 0) -> (size_needed: int) {
+	assert(nails >= 0 && nails < (size_of(T) * 8));
+
+	bits := internal_count_bits(a);
+	size := size_of(T);
+
+	size_needed  =  bits / ((size * 8) - nails);
+	size_needed += 1 if (bits % ((size * 8) - nails)) != 0 else 0;
+
+	return size_needed;
+}
+
+/*
+	Based on gmp's mpz_export.
+	See https://gmplib.org/manual/Integer-Import-and-Export.html
+
+	`buf` is a pre-allocated slice of type `T` "words", which must be an unsigned integer of some description.
+		Use `internal_int_pack_count(a, T, nails)` to calculate the necessary size.
+		The library internally uses `DIGIT` as the type, which is u64 or u32 depending on the platform.
+		You are of course welcome to export to []u8, []u32be, and so forth.
+		After this you can use `mem.slice_data_cast` to interpret the buffer as bytes if you so choose.
+
+	`nails` are the number of top bits the output "word" reserves.
+		To mimic the internals of this library, this would be 4.
+
+	To use the minimum amount of output bytes, set `nails` to 0 and pass a `[]u8`.
+	IMPORTANT: `pack` serializes the magnitude of an Int, that is, the output is unsigned.
+
+	Assumes `a` not to be `nil` and to have been initialized.
+*/
+internal_int_pack :: proc(a: ^Int, buf: []$T, nails := 0, order := Order.LSB_First) -> (written: int, err: Error)
+                     where intrinsics.type_is_integer(T) && intrinsics.type_is_unsigned(T) && size_of(T) <= 16 {
+
+	assert(nails >= 0 && nails < (size_of(T) * 8));
+
+	type_size  := size_of(T);
+	type_bits  := (type_size * 8) - nails;
+
+	word_count := internal_int_pack_count(a, T, nails);
+	bit_count  := internal_count_bits(a);
+
+	if len(buf) < word_count {
+		return 0, .Buffer_Overflow;
+	}
+
+	bit_offset  := 0;
+	word_offset := 0;
+
+	#no_bounds_check for i := 0; i < word_count; i += 1 {
+		bit_offset = i * type_bits;
+		if order == .MSB_First {
+			word_offset = word_count - i - 1;
+		} else {
+			word_offset = i;
+		}
+
+		bits_to_get := min(type_bits, bit_count - bit_offset);
+		W := internal_int_bitfield_extract(a, bit_offset, bits_to_get) or_return;
+		buf[word_offset] = T(W);
+	}
+
+	return word_count, nil;
+}
+
+
+
+internal_int_unpack :: proc(a: ^Int, buf: []$T, nails := 0, order := Order.LSB_First, allocator := context.allocator) -> (err: Error)
+                     where intrinsics.type_is_integer(T) && intrinsics.type_is_unsigned(T) && size_of(T) <= 16 {
+	assert(nails >= 0 && nails < (size_of(T) * 8));
+	context.allocator = allocator;
+
+	type_size  := size_of(T);
+	type_bits  := (type_size * 8) - nails;
+	type_mask  := T(1 << uint(type_bits)) - 1;
+
+	if len(buf) == 0 {
+		return .Invalid_Argument;
+	}
+
+	bit_count   := type_bits * len(buf);
+	digit_count := (bit_count / _DIGIT_BITS) + min(1, bit_count % _DIGIT_BITS);
+
+	/*
+		Pre-size output Int.
+	*/
+	internal_grow(a, digit_count) or_return;
+
+	t := &Int{};
+	defer internal_destroy(t);
+
+	if order == .LSB_First {
+		for W, i in buf {
+			internal_set(t, W & type_mask)                           or_return;
+			internal_shl(t, t, type_bits * i)                        or_return;
+			internal_add(a, a, t)                                    or_return;
+		}
+	} else {
+		for W in buf {
+			internal_set(t, W & type_mask)                           or_return;
+			internal_shl(a, a, type_bits)                            or_return;
+			internal_add(a, a, t)                                    or_return;
+		}		
+	}
+
+	return internal_clamp(a);
+}
 
 /*
 	Overestimate the size needed for the bigint to string conversion by a very small amount.

From a3a891a7f4b0c099af49eb180dc8ce2e09527b50 Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Mon, 6 Sep 2021 15:41:09 +0100
Subject: [PATCH 31/33] Add `intrinsics.is_package_imported(<string>)`

---
 core/intrinsics/intrinsics.odin |  3 +++
 src/check_builtin.cpp           | 23 +++++++++++++++++++++++
 src/checker_builtin_procs.hpp   |  7 +++++--
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin
index 008e2ee5a..28a178db1 100644
--- a/core/intrinsics/intrinsics.odin
+++ b/core/intrinsics/intrinsics.odin
@@ -2,6 +2,9 @@
 //+ignore
 package intrinsics
 
+// Package-Related
+is_package_imported :: proc(package_name: string) -> bool ---
+
 // Types
 simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T
 soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index 05a5fceda..ffd622e4a 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -1871,6 +1871,29 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 		operand->type = alloc_type_simd_vector(count, elem);
 		break;
 	}
+	
+	case BuiltinProc_is_package_imported: {
+		bool value = false;
+
+		if (!is_type_string(operand->type) && (operand->mode != Addressing_Constant)) {
+			error(ce->args[0], "Expected a constant string for '%.*s'", LIT(builtin_name));
+		} else if (operand->value.kind == ExactValue_String) {
+			String pkg_name = operand->value.value_string;
+			// TODO(bill): probably should have this be a `StringMap` eventually
+			for_array(i, c->info->packages.entries) {
+				AstPackage *pkg = c->info->packages.entries[i].value;
+				if (pkg->name == pkg_name) {
+					value = true;
+					break;
+				}
+			}
+		}
+		
+		operand->mode = Addressing_Constant;
+		operand->type = t_untyped_bool;
+		operand->value = exact_value_bool(value);
+		break;
+	}
 
 	case BuiltinProc_soa_struct: {
 		Operand x = {};
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index dc9d139f7..466e679c3 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -36,6 +36,8 @@ enum BuiltinProcId {
 	BuiltinProc_DIRECTIVE, // NOTE(bill): This is used for specialized hash-prefixed procedures
 
 	// "Intrinsics"
+	BuiltinProc_is_package_imported,
+	
 	BuiltinProc_simd_vector,
 	BuiltinProc_soa_struct,
 
@@ -230,7 +232,6 @@ BuiltinProc__type_simple_boolean_end,
 
 BuiltinProc__type_end,
 
-
 	BuiltinProc_COUNT,
 };
 gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
@@ -270,6 +271,8 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 
 
 	// "Intrinsics"
+	{STR_LIT("is_package_imported"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+		
 	{STR_LIT("simd_vector"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 	{STR_LIT("soa_struct"),  2, false, Expr_Expr, BuiltinProcPkg_intrinsics}, // Type
 
@@ -459,6 +462,6 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 
 	{STR_LIT("type_equal_proc"),  1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("type_hasher_proc"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
-
+	
 	{STR_LIT(""), 0, false, Expr_Stmt, BuiltinProcPkg_intrinsics},
 };

From 31f779f1a4e9242ffa50ac4fc38fa8dbb10e6078 Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Mon, 6 Sep 2021 15:45:05 +0100
Subject: [PATCH 32/33] `intrinsics.alloca` now returns `[^]u8`

---
 core/intrinsics/intrinsics.odin | 2 +-
 src/check_builtin.cpp           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin
index 28a178db1..1aeee5d4c 100644
--- a/core/intrinsics/intrinsics.odin
+++ b/core/intrinsics/intrinsics.odin
@@ -19,7 +19,7 @@ trap       :: proc() -> ! ---
 
 // Instructions
 
-alloca             :: proc(size, align: int) -> ^u8 ---
+alloca             :: proc(size, align: int) -> [^]u8 ---
 cpu_relax          :: proc() ---
 read_cycle_counter :: proc() -> i64 ---
 
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index ffd622e4a..1f3928bd8 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -2055,7 +2055,7 @@ bool check_builtin_procedure(CheckerContext *c, Operand *operand, Ast *call, i32
 				error(al.expr, "Alignment parameter to '%.*s' must be constant", LIT(builtin_name));
 			}
 
-			operand->type = t_u8_ptr;
+			operand->type = alloc_type_multi_pointer(t_u8);
 			operand->mode = Addressing_Value;
 			break;
 		}

From bc15ce302c0e095fe8db245194e59adc0533eebe Mon Sep 17 00:00:00 2001
From: gingerBill <bill@gingerbill.org>
Date: Mon, 6 Sep 2021 15:49:51 +0100
Subject: [PATCH 33/33] Add dummy docs for `intrinsics.syscall` on Linux and
 Darwin

---
 core/intrinsics/intrinsics.odin | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/core/intrinsics/intrinsics.odin b/core/intrinsics/intrinsics.odin
index 1aeee5d4c..51fda2458 100644
--- a/core/intrinsics/intrinsics.odin
+++ b/core/intrinsics/intrinsics.odin
@@ -49,6 +49,9 @@ fixed_point_div_sat :: proc(lhs, rhs: $T, #const scale: uint) -> T where type_is
 // Compiler Hints
 expect :: proc(val, expected_val: T) -> T ---
 
+// Linux and Darwin Only
+syscall :: proc(id: uintptr, args: ..uintptr) -> uintptr ---
+
 
 // Atomics
 atomic_fence        :: proc() ---