big: Split up mul into internal and public parts.

2026-08-03 14:18:47 +00:00 · 2021-08-11 20:59:53 +02:00
parent 9858989b1c
commit f8442e0524
6 changed files with 316 additions and 280 deletions
@@ -8,7 +8,7 @@ package big
 	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
 	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.

-	This file collects public procs into proc maps, as well as any of their aliases.
+	This file collects public proc maps and their aliases.
 /*

 */
@@ -19,15 +19,18 @@ package big
 */

 /*
-	err = add(dest, a, b);
+	High-level addition. Handles sign.
 */
 add :: proc {
 	/*
-		int_add :: proc(dest, a, b: ^Int) -> (err: Error)
+		int_add :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error)
 	*/
 	int_add,
 	/*
-		int_add_digit :: proc(dest, a: ^Int, digit: DIGIT) -> (err: Error)
+		Adds the unsigned `DIGIT` immediate to an `Int`, such that the
+		`DIGIT` doesn't have to be turned into an `Int` first.
+
+		int_add_digit :: proc(dest, a: ^Int, digit: DIGIT, allocator := context.allocator) -> (err: Error)
 	*/
 	int_add_digit,
 };
@@ -24,6 +24,7 @@ import "core:intrinsics"
 	High-level addition. Handles sign.
 */
 int_add :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || a == nil || b == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(dest, a, b); err != nil { return err; }
 	/*
 		All parameters have been initialized.
@@ -38,6 +39,7 @@ int_add :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error
 	dest = a + digit;
 */
 int_add_digit :: proc(dest, a: ^Int, digit: DIGIT, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || a == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(a); err != nil { return err; }
 	/*
 		Grow destination as required.
@@ -54,6 +56,7 @@ int_add_digit :: proc(dest, a: ^Int, digit: DIGIT, allocator := context.allocato
 	High-level subtraction, dest = number - decrease. Handles signs.
 */
 int_sub :: proc(dest, number, decrease: ^Int, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || number == nil || decrease == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(dest, number, decrease); err != nil { return err; }
 	/*
 		All parameters have been initialized.
@@ -68,6 +71,7 @@ int_sub :: proc(dest, number, decrease: ^Int, allocator := context.allocator) ->
 	dest = a - digit;
 */
 int_sub_digit :: proc(dest, a: ^Int, digit: DIGIT, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || a == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(a); err != nil { return err; }
 	/*
 		Grow destination as required.
@@ -85,55 +89,14 @@ int_sub_digit :: proc(dest, a: ^Int, digit: DIGIT, allocator := context.allocato
 	dest = src >> 1
 */
 int_halve :: proc(dest, src: ^Int) -> (err: Error) {
-	dest := dest; src := src;
-	if err = clear_if_uninitialized(dest); err != nil {
-		return err;
-	}
+	if dest == nil || src == nil { return .Invalid_Pointer; }
+	if err = clear_if_uninitialized(dest, src); err != nil { return err; }
 	/*
 		Grow destination as required.
 	*/
-	if dest != src {
-		if err = grow(dest, src.used); err != nil {
-			return err;
-		}
-	}
+	if dest != src { if err = grow(dest, src.used + 1); err != nil { return err; } }

-	old_used  := dest.used;
-	dest.used  = src.used;
-
-	/*
-		Carry
-	*/
-	fwd_carry := DIGIT(0);
-
-	for x := dest.used - 1; x >= 0; x -= 1 {
-		/*
-			Get the carry for the next iteration.
-		*/
-		src_digit := src.digit[x];
-		carry     := src_digit & 1;
-		/*
-			Shift the current digit, add in carry and store.
-		*/
-		dest.digit[x] = (src_digit >> 1) | (fwd_carry << (_DIGIT_BITS - 1));
-		/*
-			Forward carry to next iteration.
-		*/
-		fwd_carry = carry;
-	}
-
-	zero_count := old_used - dest.used;
-	/*
-		Zero remainder.
-	*/
-	if zero_count > 0 {
-		mem.zero_slice(dest.digit[dest.used:][:zero_count]);
-	}
-	/*
-		Adjust dest.used based on leading zeroes.
-	*/
-	dest.sign = src.sign;
-	return clamp(dest);
+	return #force_inline internal_int_shr1(dest, src);
 }
 halve :: proc { int_halve, };
 shr1  :: halve;
@@ -143,251 +106,36 @@ shr1  :: halve;
 	dest = src << 1
 */
 int_double :: proc(dest, src: ^Int) -> (err: Error) {
-	dest := dest; src := src;
-	if err = clear_if_uninitialized(dest); err != nil {
-		return err;
-	}
+	if dest == nil || src == nil { return .Invalid_Pointer; }
+	if err = clear_if_uninitialized(dest, src); err != nil { return err; }
 	/*
 		Grow destination as required.
 	*/
-	if dest != src {
-		if err = grow(dest, src.used + 1); err != nil {
-			return err;
-		}
-	}
+	if dest != src { if err = grow(dest, src.used + 1); err != nil { return err; } }

-	old_used  := dest.used;
-	dest.used  = src.used + 1;
-
-	/*
-		Forward carry
-	*/
-	carry := DIGIT(0);
-	for x := 0; x < src.used; x += 1 {
-		/*
-			Get what will be the *next* carry bit from the MSB of the current digit.
-		*/
-		src_digit := src.digit[x];
-		fwd_carry := src_digit >> (_DIGIT_BITS - 1);
-
-		/*
-			Now shift up this digit, add in the carry [from the previous]
-		*/
-		dest.digit[x] = (src_digit << 1 | carry) & _MASK;
-
-		/*
-			Update carry
-		*/
-		carry = fwd_carry;
-	}
-	/*
-		New leading digit?
-	*/
-	if carry != 0 {
-		/*
-			Add a MSB which is always 1 at this point.
-		*/
-		dest.digit[dest.used] = 1;
-	}
-	zero_count := old_used - dest.used;
-	/*
-		Zero remainder.
-	*/
-	if zero_count > 0 {
-		mem.zero_slice(dest.digit[dest.used:][:zero_count]);
-	}
-	/*
-		Adjust dest.used based on leading zeroes.
-	*/
-	dest.sign = src.sign;
-	return clamp(dest);
+	return #force_inline internal_int_shl1(dest, src);
 }
 double :: proc { int_double, };
 shl1   :: double;

-/*
-	remainder = numerator % (1 << bits)
-*/
-int_mod_bits :: proc(remainder, numerator: ^Int, bits: int) -> (err: Error) {
-	if err = clear_if_uninitialized(remainder); err != nil { return err; }
-	if err = clear_if_uninitialized(numerator); err != nil { return err; }
-
-	if bits  < 0 { return .Invalid_Argument; }
-	if bits == 0 { return zero(remainder); }
-
-	/*
-		If the modulus is larger than the value, return the value.
-	*/
-	err = copy(remainder, numerator);
-	if bits >= (numerator.used * _DIGIT_BITS) || err != nil {
-		return;
-	}
-
-	/*
-		Zero digits above the last digit of the modulus.
-	*/
-	zero_count := (bits / _DIGIT_BITS) + 0 if (bits % _DIGIT_BITS == 0) else 1;
-	/*
-		Zero remainder.
-	*/
-	if zero_count > 0 {
-		mem.zero_slice(remainder.digit[zero_count:]);
-	}
-
-	/*
-		Clear the digit that is not completely outside/inside the modulus.
-	*/
-	remainder.digit[bits / _DIGIT_BITS] &= DIGIT(1 << DIGIT(bits % _DIGIT_BITS)) - DIGIT(1);
-	return clamp(remainder);
-}
-mod_bits :: proc { int_mod_bits, };
-
 /*
 	Multiply by a DIGIT.
 */
-int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT) -> (err: Error) {
+int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || src == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(src, dest); err != nil { return err; }

-	if multiplier == 0 {
-		return zero(dest);
-	}
-	if multiplier == 1 {
-		return copy(dest, src);
-	}
-
-	/*
-		Power of two?
-	*/
-	if multiplier == 2 {
-		return double(dest, src);
-	}
-	if is_power_of_two(int(multiplier)) {
-		ix: int;
-		if ix, err = log(multiplier, 2); err != nil { return err; }
-		return shl(dest, src, ix);
-	}
-
-	/*
-		Ensure `dest` is big enough to hold `src` * `multiplier`.
-	*/
-	if err = grow(dest, max(src.used + 1, _DEFAULT_DIGIT_COUNT)); err != nil { return err; }
-
-	/*
-		Save the original used count.
-	*/
-	old_used := dest.used;
-	/*
-		Set the sign.
-	*/
-	dest.sign = src.sign;
-	/*
-		Set up carry.
-	*/
-	carry := _WORD(0);
-	/*
-		Compute columns.
-	*/
-	ix := 0;
-	for ; ix < src.used; ix += 1 {
-		/*
-			Compute product and carry sum for this term
-		*/
-		product := carry + _WORD(src.digit[ix]) * _WORD(multiplier);
-		/*
-			Mask off higher bits to get a single DIGIT.
-		*/
-		dest.digit[ix] = DIGIT(product & _WORD(_MASK));
-		/*
-			Send carry into next iteration
-		*/
-		carry = product >> _DIGIT_BITS;
-	}
-
-	/*
-		Store final carry [if any] and increment used.
-	*/
-	dest.digit[ix] = DIGIT(carry);
-	dest.used = src.used + 1;
-	/*
-		Zero unused digits.
-	*/
-	zero_count := old_used - dest.used;
-	if zero_count > 0 {
-		mem.zero_slice(dest.digit[zero_count:]);
-	}
-	return clamp(dest);
+	return #force_inline internal_int_mul_digit(dest, src, multiplier, allocator);
 }

 /*
 	High level multiplication (handles sign).
 */
-int_mul :: proc(dest, src, multiplier: ^Int) -> (err: Error) {
+int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.allocator) -> (err: Error) {
+	if dest == nil || src == nil || multiplier == nil { return .Invalid_Pointer; }
 	if err = clear_if_uninitialized(dest, src, multiplier); err != nil { return err; }

-	/*
-		Early out for `multiplier` is zero; Set `dest` to zero.
-	*/
-	if z, _ := is_zero(multiplier); z { return zero(dest); }
-	if z, _ := is_zero(src);        z { return zero(dest); }
-
-	if src == multiplier {
-		/*
-			Do we need to square?
-		*/
-		if        false && src.used >= _SQR_TOOM_CUTOFF {
-			/* Use Toom-Cook? */
-			// err = s_mp_sqr_toom(a, c);
-		} else if false && src.used >= _SQR_KARATSUBA_CUTOFF {
-			/* Karatsuba? */
-			// err = s_mp_sqr_karatsuba(a, c);
-		} else if false && ((src.used * 2) + 1) < _WARRAY &&
-		                   src.used < (_MAX_COMBA / 2) {
-			/* Fast comba? */
-			// err = s_mp_sqr_comba(a, c);
-		} else {
-			err = _int_sqr(dest, src);
-		}
-	} else {
-		/*
-			Can we use the balance method? Check sizes.
-			* The smaller one needs to be larger than the Karatsuba cut-off.
-			* The bigger one needs to be at least about one `_MUL_KARATSUBA_CUTOFF` bigger
-			* to make some sense, but it depends on architecture, OS, position of the
-			* stars... so YMMV.
-			* Using it to cut the input into slices small enough for _mul_comba
-			* was actually slower on the author's machine, but YMMV.
-		*/
-
-		min_used := min(src.used, multiplier.used);
-		max_used := max(src.used, multiplier.used);
-		digits   := src.used + multiplier.used + 1;
-
-		if        false &&  min_used     >= _MUL_KARATSUBA_CUTOFF &&
-						    max_used / 2 >= _MUL_KARATSUBA_CUTOFF &&
-			/*
-				Not much effect was observed below a ratio of 1:2, but again: YMMV.
-			*/
-							max_used     >= 2 * min_used {
-			// err = s_mp_mul_balance(a,b,c);
-		} else if false && min_used >= _MUL_TOOM_CUTOFF {
-			// err = s_mp_mul_toom(a, b, c);
-		} else if false && min_used >= _MUL_KARATSUBA_CUTOFF {
-			// err = s_mp_mul_karatsuba(a, b, c);
-		} else if digits < _WARRAY && min_used <= _MAX_COMBA {
-			/*
-				Can we use the fast multiplier?
-				* The fast multiplier can be used if the output will
-				* have less than MP_WARRAY digits and the number of
-				* digits won't affect carry propagation
-			*/
-			err = _int_mul_comba(dest, src, multiplier, digits);
-		} else {
-			err = _int_mul(dest, src, multiplier, digits);
-		}
-	}
-	neg      := src.sign != multiplier.sign;
-	dest.sign = .Negative if dest.used > 0 && neg else .Zero_or_Positive;
-	return err;
+	return #force_inline internal_int_mul(dest, src, multiplier, allocator);
 }

 mul :: proc { int_mul, int_mul_digit, };
@@ -1219,6 +967,7 @@ _int_div_digit :: proc(quotient, numerator: ^Int, denominator: DIGIT) -> (remain
 	Function computing both GCD and (if target isn't `nil`) also LCM.
 */
 int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int) -> (err: Error) {
+	if res_gcd == nil && res_lcm == nil { return nil; }
 	if err = clear_if_uninitialized(res_gcd, res_lcm, a, b); err != nil { return err; }

 	az, _ := is_zero(a); bz, _ := is_zero(b);
@@ -1435,6 +1184,43 @@ _int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int) -> (err: Error) {
 	return err;
 }

+/*
+	remainder = numerator % (1 << bits)
+*/
+int_mod_bits :: proc(remainder, numerator: ^Int, bits: int) -> (err: Error) {
+	if err = clear_if_uninitialized(remainder); err != nil { return err; }
+	if err = clear_if_uninitialized(numerator); err != nil { return err; }
+
+	if bits  < 0 { return .Invalid_Argument; }
+	if bits == 0 { return zero(remainder); }
+
+	/*
+		If the modulus is larger than the value, return the value.
+	*/
+	err = copy(remainder, numerator);
+	if bits >= (numerator.used * _DIGIT_BITS) || err != nil {
+		return;
+	}
+
+	/*
+		Zero digits above the last digit of the modulus.
+	*/
+	zero_count := (bits / _DIGIT_BITS) + 0 if (bits % _DIGIT_BITS == 0) else 1;
+	/*
+		Zero remainder.
+	*/
+	if zero_count > 0 {
+		mem.zero_slice(remainder.digit[zero_count:]);
+	}
+
+	/*
+		Clear the digit that is not completely outside/inside the modulus.
+	*/
+	remainder.digit[bits / _DIGIT_BITS] &= DIGIT(1 << DIGIT(bits % _DIGIT_BITS)) - DIGIT(1);
+	return clamp(remainder);
+}
+mod_bits :: proc { int_mod_bits, };
+

 when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
 	_factorial_table := [35]_WORD{
@@ -287,7 +287,7 @@ int_grow :: proc(a: ^Int, digits: int, allow_shrink := false, allocator := conte

 	/*
 		If not yet iniialized, initialize the `digit` backing with the allocator we were passed.
-		Otherwise, `[dynamic]DIGIT` already knows what allocator was used for it, so reuse will do the right thing.
+		Otherwise, `[dynamic]DIGIT` already knows what allocator was used for it, so resize will do the right thing.
 	*/
 	if raw.cap == 0 {
 		a.digit = mem.make_dynamic_array_len_cap([dynamic]DIGIT, needed, needed, allocator);
@@ -314,7 +314,7 @@ int_clear :: proc(a: ^Int, minimize := false, allocator := context.allocator) ->

 	raw := transmute(mem.Raw_Dynamic_Array)a.digit;
 	if raw.cap != 0 {
-		mem.zero_slice(a.digit[:]);
+		mem.zero_slice(a.digit[:a.used]);
 	}
 	a.sign = .Zero_or_Positive;
 	a.used = 0;
@@ -328,7 +328,7 @@ zero  :: clear;
 	Set the `Int` to 1 and optionally shrink it to the minimum backing size.
 */
 int_one :: proc(a: ^Int, minimize := false, allocator := context.allocator) -> (err: Error) {
-	return set(a, 1, minimize, allocator);
+	return copy(a, ONE, minimize, allocator);
 }
 one :: proc { int_one, };

@@ -597,6 +597,7 @@ _zero_unused :: proc(a: ^Int) {

 clear_if_uninitialized_single :: proc(arg: ^Int) -> (err: Error) {
 	if !is_initialized(arg) {
+		if arg == nil { return nil; }
 		return grow(arg, _DEFAULT_DIGIT_COUNT);
 	}
 	return err;
@@ -604,7 +605,8 @@ clear_if_uninitialized_single :: proc(arg: ^Int) -> (err: Error) {

 clear_if_uninitialized_multi :: proc(args: ..^Int) -> (err: Error) {
 	for i in args {
-		if i != nil && !is_initialized(i) {
+		if i == nil { continue; }
+		if !is_initialized(i) {
 			e := grow(i, _DEFAULT_DIGIT_COUNT);
 			if e != nil { err = e; }
 		}
@@ -690,9 +692,9 @@ initialize_constants :: proc() -> (res: int) {
 		We set these special values to -1 or 1 so they don't get mistake for zero accidentally.
 		This allows for shortcut tests of is_zero as .used == 0.
 	*/
+	set(      NAN,  1);       NAN.flags = {.Immutable, .NaN};
 	set(      INF,  1);       INF.flags = {.Immutable, .Inf};
 	set(      INF, -1); MINUS_INF.flags = {.Immutable, .Inf};
-	set(      NAN,  1);       NAN.flags = {.Immutable, .NaN};

 	return #config(MUL_KARATSUBA_CUTOFF, _DEFAULT_MUL_KARATSUBA_CUTOFF);
 }
@@ -443,4 +443,241 @@ internal_int_sub_digit :: proc(dest, number: ^Int, digit: DIGIT) -> (err: Error)
 	return clamp(dest);
 }

-internal_sub :: proc { internal_int_sub_signed, internal_int_sub_digit, };
+internal_sub :: proc { internal_int_sub_signed, internal_int_sub_digit, };
+
+/*
+	dest = src  / 2
+	dest = src >> 1
+*/
+internal_int_shr1 :: proc(dest, src: ^Int) -> (err: Error) {
+	old_used  := dest.used; dest.used = src.used;
+	/*
+		Carry
+	*/
+	fwd_carry := DIGIT(0);
+
+	#no_bounds_check for x := dest.used - 1; x >= 0; x -= 1 {
+		/*
+			Get the carry for the next iteration.
+		*/
+		src_digit := src.digit[x];
+		carry     := src_digit & 1;
+		/*
+			Shift the current digit, add in carry and store.
+		*/
+		dest.digit[x] = (src_digit >> 1) | (fwd_carry << (_DIGIT_BITS - 1));
+		/*
+			Forward carry to next iteration.
+		*/
+		fwd_carry = carry;
+	}
+
+	zero_count := old_used - dest.used;
+	/*
+		Zero remainder.
+	*/
+	if zero_count > 0 {
+		mem.zero_slice(dest.digit[dest.used:][:zero_count]);
+	}
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+	dest.sign = src.sign;
+	return clamp(dest);	
+}
+
+/*
+	dest = src  * 2
+	dest = src << 1
+*/
+internal_int_shl1 :: proc(dest, src: ^Int) -> (err: Error) {
+	old_used  := dest.used; dest.used  = src.used + 1;
+
+	/*
+		Forward carry
+	*/
+	carry := DIGIT(0);
+	#no_bounds_check for x := 0; x < src.used; x += 1 {
+		/*
+			Get what will be the *next* carry bit from the MSB of the current digit.
+		*/
+		src_digit := src.digit[x];
+		fwd_carry := src_digit >> (_DIGIT_BITS - 1);
+
+		/*
+			Now shift up this digit, add in the carry [from the previous]
+		*/
+		dest.digit[x] = (src_digit << 1 | carry) & _MASK;
+
+		/*
+			Update carry
+		*/
+		carry = fwd_carry;
+	}
+	/*
+		New leading digit?
+	*/
+	if carry != 0 {
+		/*
+			Add a MSB which is always 1 at this point.
+		*/
+		dest.digit[dest.used] = 1;
+	}
+	zero_count := old_used - dest.used;
+	/*
+		Zero remainder.
+	*/
+	if zero_count > 0 {
+		mem.zero_slice(dest.digit[dest.used:][:zero_count]);
+	}
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+	dest.sign = src.sign;
+	return clamp(dest);
+}
+
+/*
+	Multiply by a DIGIT.
+*/
+internal_int_mul_digit :: proc(dest, src: ^Int, multiplier: DIGIT, allocator := context.allocator) -> (err: Error) {
+	if multiplier == 0 {
+		return zero(dest);
+	}
+	if multiplier == 1 {
+		return copy(dest, src);
+	}
+
+	/*
+		Power of two?
+	*/
+	if multiplier == 2 {
+		return #force_inline shl1(dest, src);
+	}
+	if is_power_of_two(int(multiplier)) {
+		ix: int;
+		if ix, err = log(multiplier, 2); err != nil { return err; }
+		return shl(dest, src, ix);
+	}
+
+	/*
+		Ensure `dest` is big enough to hold `src` * `multiplier`.
+	*/
+	if err = grow(dest, max(src.used + 1, _DEFAULT_DIGIT_COUNT), false, allocator); err != nil { return err; }
+
+	/*
+		Save the original used count.
+	*/
+	old_used := dest.used;
+	/*
+		Set the sign.
+	*/
+	dest.sign = src.sign;
+	/*
+		Set up carry.
+	*/
+	carry := _WORD(0);
+	/*
+		Compute columns.
+	*/
+	ix := 0;
+	#no_bounds_check for ; ix < src.used; ix += 1 {
+		/*
+			Compute product and carry sum for this term
+		*/
+		product := carry + _WORD(src.digit[ix]) * _WORD(multiplier);
+		/*
+			Mask off higher bits to get a single DIGIT.
+		*/
+		dest.digit[ix] = DIGIT(product & _WORD(_MASK));
+		/*
+			Send carry into next iteration
+		*/
+		carry = product >> _DIGIT_BITS;
+	}
+
+	/*
+		Store final carry [if any] and increment used.
+	*/
+	dest.digit[ix] = DIGIT(carry);
+	dest.used = src.used + 1;
+	/*
+		Zero unused digits.
+	*/
+	zero_count := old_used - dest.used;
+	if zero_count > 0 {
+		mem.zero_slice(dest.digit[zero_count:]);
+	}
+	return clamp(dest);
+}
+
+/*
+	High level multiplication (handles sign).
+*/
+internal_int_mul :: proc(dest, src, multiplier: ^Int, allocator := context.allocator) -> (err: Error) {
+	/*
+		Early out for `multiplier` is zero; Set `dest` to zero.
+	*/
+	if multiplier.used == 0 || src.used == 0 { return zero(dest); }
+
+	if src == multiplier {
+		/*
+			Do we need to square?
+		*/
+		if        false && src.used >= _SQR_TOOM_CUTOFF {
+			/* Use Toom-Cook? */
+			// err = s_mp_sqr_toom(a, c);
+		} else if false && src.used >= _SQR_KARATSUBA_CUTOFF {
+			/* Karatsuba? */
+			// err = s_mp_sqr_karatsuba(a, c);
+		} else if false && ((src.used * 2) + 1) < _WARRAY &&
+		                   src.used < (_MAX_COMBA / 2) {
+			/* Fast comba? */
+			// err = s_mp_sqr_comba(a, c);
+		} else {
+			err = _int_sqr(dest, src);
+		}
+	} else {
+		/*
+			Can we use the balance method? Check sizes.
+			* The smaller one needs to be larger than the Karatsuba cut-off.
+			* The bigger one needs to be at least about one `_MUL_KARATSUBA_CUTOFF` bigger
+			* to make some sense, but it depends on architecture, OS, position of the
+			* stars... so YMMV.
+			* Using it to cut the input into slices small enough for _mul_comba
+			* was actually slower on the author's machine, but YMMV.
+		*/
+
+		min_used := min(src.used, multiplier.used);
+		max_used := max(src.used, multiplier.used);
+		digits   := src.used + multiplier.used + 1;
+
+		if        false &&  min_used     >= _MUL_KARATSUBA_CUTOFF &&
+						    max_used / 2 >= _MUL_KARATSUBA_CUTOFF &&
+			/*
+				Not much effect was observed below a ratio of 1:2, but again: YMMV.
+			*/
+							max_used     >= 2 * min_used {
+			// err = s_mp_mul_balance(a,b,c);
+		} else if false && min_used >= _MUL_TOOM_CUTOFF {
+			// err = s_mp_mul_toom(a, b, c);
+		} else if false && min_used >= _MUL_KARATSUBA_CUTOFF {
+			// err = s_mp_mul_karatsuba(a, b, c);
+		} else if digits < _WARRAY && min_used <= _MAX_COMBA {
+			/*
+				Can we use the fast multiplier?
+				* The fast multiplier can be used if the output will
+				* have less than MP_WARRAY digits and the number of
+				* digits won't affect carry propagation
+			*/
+			err = _int_mul_comba(dest, src, multiplier, digits);
+		} else {
+			err = _int_mul(dest, src, multiplier, digits);
+		}
+	}
+	neg      := src.sign != multiplier.sign;
+	dest.sign = .Negative if dest.used > 0 && neg else .Zero_or_Positive;
+	return err;
+}
+
+internal_mul :: proc { internal_int_mul, internal_int_mul_digit, };
@@ -24,6 +24,11 @@ PyRes :: struct {
 	err: Error,
 }

+@export test_initialize_constants :: proc "c" () -> (res: int) {
+	context = runtime.default_context();
+	return initialize_constants();
+}
+
@export test_error_string :: proc "c" (err: Error) -> (res: cstring) {
 	context = runtime.default_context();
 	es := Error_String;
@@ -113,6 +113,9 @@ def load(export_name, args, res):
 class Res(Structure):
 	_fields_ = [("res", c_char_p), ("err", c_uint64)]

+initialize_constants = load(l.test_initialize_constants, [], c_uint64)
+initialize_constants()
+
 error_string = load(l.test_error_string, [c_byte], c_char_p)

 add  = load(l.test_add, [c_char_p, c_char_p], Res)