From 2110778040787234041ee977ec456e5098f2caf7 Mon Sep 17 00:00:00 2001
From: Jeroen van Rijn <Kelimion@users.noreply.github.com>
Date: Tue, 31 Aug 2021 21:26:19 +0200
Subject: [PATCH] big: Add `internal_int_exponent_mod_fast`.

---
 core/math/big/example.odin  |   6 +-
 core/math/big/internal.odin |  25 ++--
 core/math/big/prime.odin    | 283 +++++++++++++++++++++++++++++++++++-
 core/math/big/private.odin  |   5 +-
 4 files changed, 302 insertions(+), 17 deletions(-)

diff --git a/core/math/big/example.odin b/core/math/big/example.odin
index 18b6062d9..4a64aaa01 100644
--- a/core/math/big/example.odin
+++ b/core/math/big/example.odin
@@ -208,15 +208,17 @@ int_to_byte_little :: proc(v: ^Int) {
 	}
 }
 
+// printf :: fmt.printf;
+
 demo :: proc() {
 	a, b, c, d, e, f, res := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{};
 	defer destroy(a, b, c, d, e, f, res);
 
 	set(a, 42);
 	set(b, 6);
-	set(c, 5);
+	set(c, 131);
 
-	if err := internal_int_exponent_mod(res, a, b, c, 0); err != nil {
+	if err := internal_int_exponent_mod_fast(res, a, b, c, 0); err != nil {
 		fmt.printf("Error: %v\n", err);
 	}
 
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 789163af2..ca113275c 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -991,13 +991,21 @@ internal_int_mod_bits :: proc(remainder, numerator: ^Int, bits: int, allocator :
 	public ones that have already satisfied these constraints.
 */
 
+/*
+	This procedure returns the allocated capacity of an Int.
+	Assumes `a` not to be `nil`.
+*/
+internal_int_allocated_cap :: #force_inline proc(a: ^Int) -> (cap: int) {
+	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
+	return raw.cap;
+}
+
 /*
 	This procedure will return `true` if the `Int` is initialized, `false` if not.
 	Assumes `a` not to be `nil`.
 */
 internal_int_is_initialized :: #force_inline proc(a: ^Int) -> (initialized: bool) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-	return raw.cap >= _MIN_DIGIT_COUNT;
+	return internal_int_allocated_cap(a) >= _MIN_DIGIT_COUNT;
 }
 internal_is_initialized :: proc { internal_int_is_initialized, };
 
@@ -1650,8 +1658,7 @@ internal_int_destroy :: proc(integers: ..^Int) {
 	integers := integers;
 
 	for a in &integers {
-		raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-		if raw.cap > 0 {
+		if internal_int_allocated_cap(a) > 0 {
 			mem.zero_slice(a.digit[:]);
 			free(&a.digit[0]);
 		}
@@ -1913,23 +1920,23 @@ internal_int_shrink :: proc(a: ^Int) -> (err: Error) {
 internal_shrink :: proc { internal_int_shrink, };
 
 internal_int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := context.allocator) -> (err: Error) {
-	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
-
 	/*
 		We need at least _MIN_DIGIT_COUNT or a.used digits, whichever is bigger.
 		The caller is asking for `digits`. Let's be accomodating.
 	*/
+	cap := internal_int_allocated_cap(a);
+
 	needed := max(_MIN_DIGIT_COUNT, a.used, digits);
 	if !allow_shrink {
-		needed = max(needed, raw.cap);
+		needed = max(needed, cap);
 	}
 
 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
 	*/
-	if raw.cap == 0 {
+	if cap == 0 {
 		a.digit = make([dynamic]DIGIT, needed, allocator);
-	} else if raw.cap != needed {
+	} else if cap != needed {
 		/*
 			`[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 		*/
diff --git a/core/math/big/prime.odin b/core/math/big/prime.odin
index 1947ac634..9771ef077 100644
--- a/core/math/big/prime.odin
+++ b/core/math/big/prime.odin
@@ -144,7 +144,7 @@ internal_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := cont
 		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1;
 		internal_int_power_of_two(a, power)                          or_return;
 	} else {
-		internal_one(a);
+		internal_one(a)                                              or_return;
 		bits = 1;
 	}
 
@@ -187,7 +187,8 @@ internal_int_montgomery_setup :: proc(n: ^Int) -> (rho: DIGIT, err: Error) {
 	x := (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**8 */
 	x *= 2 - (b * x);              /* here x*a==1 mod 2**16 */
-	when _WORD_TYPE_BITS == 64 {
+
+	when _DIGIT_TYPE_BITS == 64 {
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**32 */
 		x *= 2 - (b * x);              /* here x*a==1 mod 2**64 */
 	}
@@ -473,6 +474,10 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	M := [_TAB_SIZE]Int{};
 	winsize: uint;
 
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
 	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error);
 
 	defer {
@@ -686,6 +691,280 @@ internal_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator :=
 	return err;
 }
 
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+internal_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator;
+
+	M := [_TAB_SIZE]Int{};
+	winsize: uint;
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error);
+
+	defer {
+		internal_destroy(&M[1]);
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x]);
+		}
+	}
+
+	/*
+		Find window size.
+	*/
+	x := internal_count_bits(X);
+	switch {
+	case x <= 7:
+		winsize = 2;
+	case x <= 36:
+		winsize = 3;
+	case x <= 140:
+		winsize = 4;
+	case x <= 450:
+		winsize = 5;
+	case x <= 1303:
+		winsize = 6;
+	case x <= 3529:
+		winsize = 7;
+	case:
+		winsize = 8;
+	}
+
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize;
+
+	/*
+		Init M array
+		Init first cell.
+	*/
+	cap := internal_int_allocated_cap(P);
+	internal_grow(&M[1], cap)                                        or_return;
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return;
+	}
+
+	/*
+		Determine and setup reduction code.
+	*/
+	rho: DIGIT;
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = internal_int_montgomery_setup(P)                       or_return;
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba;
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = internal_int_montgomery_reduce;
+		}
+	} else if redmode == 1 {
+		/*
+		if (MP_HAS(MP_DR_SETUP) && MP_HAS(MP_DR_REDUCE)) {
+			/* setup DR reduction for moduli of the form B**k - b */
+			mp_dr_setup(P, &mp);
+			redux = mp_dr_reduce;
+		} else {
+			err = MP_VAL;
+			goto LBL_M;
+		}
+		*/
+		return .Unimplemented;
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = internal_int_reduce_2k_setup(P)                        or_return;
+		redux = internal_int_reduce_2k;
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return;
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		internal_int_montgomery_calc_normalization(res, P)           or_return;
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return;
+	} else {
+		internal_one(res)                                            or_return;
+		internal_mod(&M[1], G, P)                                    or_return;
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1);
+	internal_copy(&M[slot], &M[1])                                   or_return;
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return;
+   		print("slot: ", &M[slot]);
+		redux(&M[slot], P, rho)                                      or_return;
+		print("slot redux: ", &M[slot]);
+	}
+
+	/*
+		Create upper table.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return;
+		redux(&M[x], P, rho)                                         or_return;
+	}
+
+	/*
+		Set initial mode and bit cnt.
+	*/
+	mode   := 0;
+	bitcnt := 1;
+	buf    := DIGIT(0);
+	digidx := X.used - 1;
+	bitcpy := 0;
+	bitbuf := DIGIT(0);
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1;
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break; }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx];
+			digidx -= 1;
+			bitcnt = _DIGIT_BITS;
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1;
+		buf <<= 1;
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue; }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+			continue;
+		}
+
+		/*
+			Else we add it to the window.
+		*/
+		bitcpy += 1;
+		bitbuf |= (y << (winsize - uint(bitcpy)));
+		mode    = 2;
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return;
+			redux(res, P, rho)                                       or_return;
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1;
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return;
+				redux(res, P, rho)                                   or_return;
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return;
+	}
+
+	return nil;
+}
+
 /*
 	Returns the number of Rabin-Miller trials needed for a given bit size.
 */
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index 7e839337f..552b100cf 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1730,9 +1730,6 @@ _private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -
 	return;
 }
 
-
-
-
 /*
 	Computes xR**-1 == x (mod N) via Montgomery Reduction.
 	This is an optimized implementation of `internal_montgomery_reduce`
@@ -1753,7 +1750,7 @@ _private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := co
 	/*
 		Grow `x` as required.
 	*/
-	internal_grow(x, n.used + 1) or_return;
+	internal_grow(x, n.used + 1)                                     or_return;
 
 	/*
 		First we have to get the digits of the input into an array of double precision words W[...]