diff --git a/base/runtime/entry_unix.odin b/base/runtime/entry_unix.odin
index 233007936..e49698e6e 100644
--- a/base/runtime/entry_unix.odin
+++ b/base/runtime/entry_unix.odin
@@ -1,5 +1,5 @@
 //+private
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 //+no-instrumentation
 package runtime
 
diff --git a/base/runtime/heap_allocator_unix.odin b/base/runtime/heap_allocator_unix.odin
index bfbbb5303..2b6698885 100644
--- a/base/runtime/heap_allocator_unix.odin
+++ b/base/runtime/heap_allocator_unix.odin
@@ -1,4 +1,4 @@
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 //+private
 package runtime
 
@@ -35,4 +35,4 @@ _heap_resize :: proc(ptr: rawptr, new_size: int) -> rawptr {
 
 _heap_free :: proc(ptr: rawptr) {
 	_unix_free(ptr)
-}
\ No newline at end of file
+}
diff --git a/base/runtime/os_specific_haiku.odin b/base/runtime/os_specific_haiku.odin
new file mode 100644
index 000000000..f8dafac3d
--- /dev/null
+++ b/base/runtime/os_specific_haiku.odin
@@ -0,0 +1,21 @@
+//+build haiku
+//+private
+package runtime
+
+foreign import libc "system:c"
+
+foreign libc { // Procedures resolved from the C library imported above as `libc`.
+	@(link_name="write") // Exposes the C symbol `write` under the Odin name `_unix_write`.
+	_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---
+
+	_errnop :: proc() -> ^i32 --- // Yields a pointer that `_stderr_write` dereferences to obtain the error code.
+}
+
+_stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
+	written := _unix_write(2, raw_data(data), len(data)) // fd 2 is stderr
+	if written >= len(data) {
+		return int(written), 0 // full write: nothing to report
+	}
+	errno_ptr := _errnop() // short or failed write: pair the count with errno
+	return int(written), _OS_Errno(errno_ptr^ if errno_ptr != nil else 0)
+}
diff --git a/build_odin.sh b/build_odin.sh
index fab6c5fd1..93319b4ef 100755
--- a/build_odin.sh
+++ b/build_odin.sh
@@ -82,6 +82,11 @@ OpenBSD)
 	LDFLAGS="$LDFLAGS -liconv"
 	LDFLAGS="$LDFLAGS $($LLVM_CONFIG --libs core native --system-libs)"
 	;;
+Haiku)
+	CXXFLAGS="$CXXFLAGS $($LLVM_CONFIG --cxxflags --ldflags) -I/system/develop/headers/private/shared -I/system/develop/headers/private/kernel"
+	LDFLAGS="$LDFLAGS -liconv"
+	LDFLAGS="$LDFLAGS $($LLVM_CONFIG --libs core native --system-libs)"
+	;;
 *)
 	error "Platform \"$OS_NAME\" unsupported"
 	;;
diff --git a/core/c/libc/errno.odin b/core/c/libc/errno.odin
index fe6fbb073..7af763706 100644
--- a/core/c/libc/errno.odin
+++ b/core/c/libc/errno.odin
@@ -80,6 +80,24 @@ when ODIN_OS == .Darwin {
 	ERANGE :: 34
 }
 
+when ODIN_OS == .Haiku { // Haiku target: errno accessor plus the three error constants declared for it.
+	@(private="file")
+	@(default_calling_convention="c")
+	foreign libc {
+		@(link_name="_errnop")
+		_get_errno :: proc() -> ^int --- // Bound to the C symbol `_errnop`; returns a pointer to the errno value.
+	}
+
+	@(private="file")
+	B_GENERAL_ERROR_BASE :: min(i32) // Most negative i32, so every code derived below is negative.
+	@(private="file")
+	B_POSIX_ERROR_BASE   :: B_GENERAL_ERROR_BASE + 0x7000 // POSIX-style codes start 0x7000 above the general base.
+
+	EDOM   :: B_POSIX_ERROR_BASE + 16 // NOTE(review): offsets presumably follow Haiku's Errors.h - confirm.
+	EILSEQ :: B_POSIX_ERROR_BASE + 38
+	ERANGE :: B_POSIX_ERROR_BASE + 17
+}
+
 // Odin has no way to make an identifier "errno" behave as a function call to
 // read the value, or to produce an lvalue such that you can assign a different
 // error value to errno. To work around this, just expose it as a function like
diff --git a/core/c/libc/stdio.odin b/core/c/libc/stdio.odin
index 39969e4a8..b83ddecc8 100644
--- a/core/c/libc/stdio.odin
+++ b/core/c/libc/stdio.odin
@@ -163,6 +163,36 @@ when ODIN_OS == .Darwin {
 	}
 }
 
+when ODIN_OS == .Haiku { // stdio types, limits and standard streams for Haiku; values presumably mirror Haiku's <stdio.h> - TODO confirm.
+	fpos_t :: distinct i64 // File-position type, declared as a distinct 64-bit integer.
+	
+	_IOFBF        :: 0 // Buffering mode: fully buffered.
+	_IOLBF        :: 1 // Buffering mode: line buffered.
+	_IONBF        :: 2 // Buffering mode: unbuffered.
+
+	BUFSIZ        :: 8192 // Stream buffer size constant.
+
+	EOF           :: int(-1) // End-of-file / failure indicator, typed with this package's `int`.
+
+	FOPEN_MAX     :: 128 // Limit on simultaneously open streams.
+
+	FILENAME_MAX  :: 256 // Limit on file name length.
+
+	L_tmpnam      :: 512 // Buffer size for a temporary file name.
+
+	SEEK_SET      :: 0 // Seek origin: start of file.
+	SEEK_CUR      :: 1 // Seek origin: current position.
+	SEEK_END      :: 2 // Seek origin: end of file.
+
+	TMP_MAX       :: 32768 // Limit on distinct temporary file names.
+
+	foreign libc { // Standard streams, imported as variables from the C library.
+		stderr: ^FILE
+		stdin:  ^FILE
+		stdout: ^FILE
+	}
+}
+
 @(default_calling_convention="c")
 foreign libc {
 	// 7.21.4 Operations on files
diff --git a/core/c/libc/time.odin b/core/c/libc/time.odin
index 72b899546..4c4280f30 100644
--- a/core/c/libc/time.odin
+++ b/core/c/libc/time.odin
@@ -45,7 +45,7 @@ when ODIN_OS == .Windows {
 	}
 }
 
-when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS == .OpenBSD {
+when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS == .OpenBSD || ODIN_OS == .Haiku {
 	@(default_calling_convention="c")
 	foreign libc {
 		// 7.27.2 Time manipulation functions
diff --git a/core/c/libc/wctype.odin b/core/c/libc/wctype.odin
index 43aee9dc6..cbce220d4 100644
--- a/core/c/libc/wctype.odin
+++ b/core/c/libc/wctype.odin
@@ -29,7 +29,11 @@ when ODIN_OS == .Windows {
 } else when ODIN_OS == .FreeBSD {
 	wctrans_t :: distinct int
 	wctype_t  :: distinct ulong
-	
+
+} else when ODIN_OS == .Haiku {
+	wctrans_t :: distinct i32
+	wctype_t  :: distinct i32
+
 }
 
 @(default_calling_convention="c")
diff --git a/core/fmt/fmt.odin b/core/fmt/fmt.odin
index e3e7a2bb5..02803f882 100644
--- a/core/fmt/fmt.odin
+++ b/core/fmt/fmt.odin
@@ -2814,10 +2814,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 					value := runtime.map_cell_index_dynamic(vs, info.map_info.vs, bucket_index)
 
 					fmt_arg(&Info{writer = fi.writer}, any{rawptr(key), info.key.id}, verb)
-					if verb == 'v' {
-						io.write_string(fi.writer, "=", &fi.n)
-					} else {
+					if hash {
 						io.write_string(fi.writer, " = ", &fi.n)
+					} else {
+						io.write_string(fi.writer, "=", &fi.n)
 					}
 					fmt_arg(fi, any{rawptr(value), info.value.id}, verb)
 
diff --git a/core/math/big/internal.odin b/core/math/big/internal.odin
index 35c95f465..03623e7f2 100644
--- a/core/math/big/internal.odin
+++ b/core/math/big/internal.odin
@@ -1181,28 +1181,18 @@ internal_cmp_digit :: internal_compare_digit
 */
 internal_int_compare_magnitude :: #force_inline proc(a, b: ^Int) -> (comparison: int) {
 	assert_if_nil(a, b)
-	/*
-		Compare based on used digits.
-	*/
+
+	// Three-way compare of |a| and |b|: differing digit counts settle it at once (presumes both are clamped - TODO confirm).
 	if a.used != b.used {
-		if a.used > b.used {
-			return +1
-		}
-		return -1
+		return +1 if a.used > b.used else -1
 	}
 
-	/*
-		Same number of used digits, compare based on their value.
-	*/
+	// Equal digit counts: walk the digits from index `used - 1` down to 0; the first differing digit decides.
 	#no_bounds_check for n := a.used - 1; n >= 0; n -= 1 {
 		if a.digit[n] != b.digit[n] {
-			if a.digit[n] > b.digit[n] {
-				return +1
-			}
-			return -1
+			return +1 if a.digit[n] > b.digit[n] else -1
 		}
 	}
-
 	return 0
 }
 internal_compare_magnitude :: proc { internal_int_compare_magnitude, }
diff --git a/core/math/big/private.odin b/core/math/big/private.odin
index d045b4239..2ee6cfafa 100644
--- a/core/math/big/private.odin
+++ b/core/math/big/private.odin
@@ -1,3402 +1,3355 @@
-/*
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
-
-	An arbitrary precision mathematics implementation in Odin.
-	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
-	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
-
-	=============================    Private procedures    =============================
-
-	Private procedures used by the above low-level routines follow.
-
-	Don't call these yourself unless you really know what you're doing.
-	They include implementations that are optimimal for certain ranges of input only.
-
-	These aren't exported for the same reasons.
-*/
-
-
-package math_big
-
-import "base:intrinsics"
-import "core:mem"
-
-/*
-	Multiplies |a| * |b| and only computes upto digs digits of result.
-	HAC pp. 595, Algorithm 14.12  Modified so you can control how
-	many digits of output are created.
-*/
-_private_int_mul :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	/*
-		Can we use the fast multiplier?
-	*/
-	if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
-		return #force_inline _private_int_mul_comba(dest, a, b, digits)
-	}
-
-	/*
-		Set up temporary output `Int`, which we'll swap for `dest` when done.
-	*/
-
-	t := &Int{}
-
-	internal_grow(t, max(digits, _DEFAULT_DIGIT_COUNT)) or_return
-	t.used = digits
-
-	/*
-		Compute the digits of the product directly.
-	*/
-	pa := a.used
-	for ix := 0; ix < pa; ix += 1 {
-		/*
-			Limit ourselves to `digits` DIGITs of output.
-		*/
-		pb    := min(b.used, digits - ix)
-		carry := _WORD(0)
-		iy    := 0
-
-		/*
-			Compute the column of the output and propagate the carry.
-		*/
-		#no_bounds_check for iy = 0; iy < pb; iy += 1 {
-			/*
-				Compute the column as a _WORD.
-			*/
-			column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry
-
-			/*
-				The new column is the lower part of the result.
-			*/
-			t.digit[ix + iy] = DIGIT(column & _WORD(_MASK))
-
-			/*
-				Get the carry word from the result.
-			*/
-			carry = column >> _DIGIT_BITS
-		}
-		/*
-			Set carry if it is placed below digits
-		*/
-		if ix + iy < digits {
-			t.digit[ix + pb] = DIGIT(carry)
-		}
-	}
-
-	internal_swap(dest, t)
-	internal_destroy(t)
-	return internal_clamp(dest)
-}
-
-
-/*
-	Multiplication using the Toom-Cook 3-way algorithm.
-
-	Much more complicated than Karatsuba but has a lower asymptotic running time of O(N**1.464).
-	This algorithm is only particularly useful on VERY large inputs.
-	(We're talking 1000s of digits here...).
-
-	This file contains code from J. Arndt's book  "Matters Computational"
-	and the accompanying FXT-library with permission of the author.
-
-	Setup from:
-		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
-		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
-
-	The interpolation from above needed one temporary variable more than the interpolation here:
-
-		Bodrato, Marco, and Alberto Zanoni. "What about Toom-Cook matrices optimality."
-		Centro Vito Volterra Universita di Roma Tor Vergata (2006)
-*/
-_private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2)
-
-	/*
-		Init temps.
-	*/
-	internal_init_multi(S1, S2, T1)             or_return
-
-	/*
-		B
-	*/
-	B := min(a.used, b.used) / 3
-
-	/*
-		a = a2 * x^2 + a1 * x + a0;
-	*/
-	internal_grow(a0, B)                        or_return
-	internal_grow(a1, B)                        or_return
-	internal_grow(a2, a.used - 2 * B)           or_return
-
-	a0.used, a1.used = B, B
-	a2.used = a.used - 2 * B
-
-	internal_copy_digits(a0, a, a0.used)        or_return
-	internal_copy_digits(a1, a, a1.used, B)     or_return
-	internal_copy_digits(a2, a, a2.used, 2 * B) or_return
-
-	internal_clamp(a0)
-	internal_clamp(a1)
-	internal_clamp(a2)
-
-	/*
-		b = b2 * x^2 + b1 * x + b0;
-	*/
-	internal_grow(b0, B)                        or_return
-	internal_grow(b1, B)                        or_return
-	internal_grow(b2, b.used - 2 * B)           or_return
-
-	b0.used, b1.used = B, B
-	b2.used = b.used - 2 * B
-
-	internal_copy_digits(b0, b, b0.used)        or_return
-	internal_copy_digits(b1, b, b1.used, B)     or_return
-	internal_copy_digits(b2, b, b2.used, 2 * B) or_return
-
-	internal_clamp(b0)
-	internal_clamp(b1)
-	internal_clamp(b2)
-
-
-	/*
-		\\ S1 = (a2+a1+a0) * (b2+b1+b0);
-	*/
-	internal_add(T1, a2, a1)                    or_return /*   T1 = a2 + a1; */
-	internal_add(S2, T1, a0)                    or_return /*   S2 = T1 + a0; */
-	internal_add(dest, b2, b1)                  or_return /* dest = b2 + b1; */
-	internal_add(S1, dest, b0)                  or_return /*   S1 =  c + b0; */
-	internal_mul(S1, S1, S2)                    or_return /*   S1 = S1 * S2; */
-
-	/*
-		\\S2 = (4*a2+2*a1+a0) * (4*b2+2*b1+b0);
-	*/
-	internal_add(T1, T1, a2)                    or_return /*   T1 = T1 + a2; */
-	internal_int_shl1(T1, T1)                   or_return /*   T1 = T1 << 1; */
-	internal_add(T1, T1, a0)                    or_return /*   T1 = T1 + a0; */
-	internal_add(dest, dest, b2)                or_return /*    c =  c + b2; */
-	internal_int_shl1(dest, dest)               or_return /*    c =  c << 1; */
-	internal_add(dest, dest, b0)                or_return /*    c =  c + b0; */
-	internal_mul(S2, T1, dest)                  or_return /*   S2 = T1 *  c; */
-
-	/*
-		\\S3 = (a2-a1+a0) * (b2-b1+b0);
-	*/
-	internal_sub(a1, a2, a1)                    or_return /*   a1 = a2 - a1; */
-	internal_add(a1, a1, a0)                    or_return /*   a1 = a1 + a0; */
-	internal_sub(b1, b2, b1)                    or_return /*   b1 = b2 - b1; */
-	internal_add(b1, b1, b0)                    or_return /*   b1 = b1 + b0; */
-	internal_mul(a1, a1, b1)                    or_return /*   a1 = a1 * b1; */
-	internal_mul(b1, a2, b2)                    or_return /*   b1 = a2 * b2; */
-
-	/*
-		\\S2 = (S2 - S3) / 3;
-	*/
-	internal_sub(S2, S2, a1)                    or_return /*   S2 = S2 - a1; */
-	_private_int_div_3(S2, S2)                  or_return /*   S2 = S2 / 3; \\ this is an exact division  */
-	internal_sub(a1, S1, a1)                    or_return /*   a1 = S1 - a1; */
-	internal_int_shr1(a1, a1)                   or_return /*   a1 = a1 >> 1; */
-	internal_mul(a0, a0, b0)                    or_return /*   a0 = a0 * b0; */
-	internal_sub(S1, S1, a0)                    or_return /*   S1 = S1 - a0; */
-	internal_sub(S2, S2, S1)                    or_return /*   S2 = S2 - S1; */
-	internal_int_shr1(S2, S2)                   or_return /*   S2 = S2 >> 1; */
-	internal_sub(S1, S1, a1)                    or_return /*   S1 = S1 - a1; */
-	internal_sub(S1, S1, b1)                    or_return /*   S1 = S1 - b1; */
-	internal_int_shl1(T1, b1)                   or_return /*   T1 = b1 << 1; */
-	internal_sub(S2, S2, T1)                    or_return /*   S2 = S2 - T1; */
-	internal_sub(a1, a1, S2)                    or_return /*   a1 = a1 - S2; */
-
-	/*
-		P = b1*x^4+ S2*x^3+ S1*x^2+ a1*x + a0;
-	*/
-	_private_int_shl_leg(b1, 4 * B)             or_return
-	_private_int_shl_leg(S2, 3 * B)             or_return
-	internal_add(b1, b1, S2)                    or_return
-	_private_int_shl_leg(S1, 2 * B)             or_return
-	internal_add(b1, b1, S1)                    or_return
-	_private_int_shl_leg(a1, 1 * B)             or_return
-	internal_add(b1, b1, a1)                    or_return
-	internal_add(dest, b1, a0)                  or_return
-
-	/*
-		a * b - P
-	*/
-	return nil
-}
-
-/*
-	product = |a| * |b| using Karatsuba Multiplication using three half size multiplications.
-
-	Let `B` represent the radix [e.g. 2**_DIGIT_BITS] and let `n` represent
-	half of the number of digits in the min(a,b)
-
-	`a` = `a1` * `B`**`n` + `a0`
-	`b` = `b`1 * `B`**`n` + `b0`
-
-	Then, a * b => 1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
-
-	Note that a1b1 and a0b0 are used twice and only need to be computed once.
-	So in total three half size (half # of digit) multiplications are performed,
-		a0b0, a1b1 and (a1+b1)(a0+b0)
-
-	Note that a multiplication of half the digits requires 1/4th the number of
-	single precision multiplications, so in total after one call 25% of the
-	single precision multiplications are saved.
-
-	Note also that the call to `internal_mul` can end up back in this function
-	if the a0, a1, b0, or b1 are above the threshold.
-
-	This is known as divide-and-conquer and leads to the famous O(N**lg(3)) or O(N**1.584)
-	work which is asymptopically lower than the standard O(N**2) that the
-	baseline/comba methods use. Generally though, the overhead of this method doesn't pay off
-	until a certain size is reached, of around 80 used DIGITs.
-*/
-_private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1)
-
-	/*
-		min # of digits, divided by two.
-	*/
-	B := min(a.used, b.used) >> 1
-
-	/*
-		Init all the temps.
-	*/
-	internal_grow(x0, B)          or_return
-	internal_grow(x1, a.used - B) or_return
-	internal_grow(y0, B)          or_return
-	internal_grow(y1, b.used - B) or_return
-	internal_grow(t1, B * 2)      or_return
-	internal_grow(x0y0, B * 2)    or_return
-	internal_grow(x1y1, B * 2)    or_return
-
-	/*
-		Now shift the digits.
-	*/
-	x0.used, y0.used = B, B
-	x1.used = a.used - B
-	y1.used = b.used - B
-
-	/*
-		We copy the digits directly instead of using higher level functions
-		since we also need to shift the digits.
-	*/
-	internal_copy_digits(x0, a, x0.used)
-	internal_copy_digits(y0, b, y0.used)
-	internal_copy_digits(x1, a, x1.used, B)
-	internal_copy_digits(y1, b, y1.used, B)
-
-	/*
-		Only need to clamp the lower words since by definition the
-		upper words x1/y1 must have a known number of digits.
-	*/
-	clamp(x0)
-	clamp(y0)
-
-	/*
-		Now calc the products x0y0 and x1y1,
-		after this x0 is no longer required, free temp [x0==t2]!
-	*/
-	internal_mul(x0y0, x0, y0)      or_return /* x0y0 = x0*y0 */
-	internal_mul(x1y1, x1, y1)      or_return /* x1y1 = x1*y1 */
-	internal_add(t1,   x1, x0)      or_return /* now calc x1+x0 and */
-	internal_add(x0,   y1, y0)      or_return /* t2 = y1 + y0 */
-	internal_mul(t1,   t1, x0)      or_return /* t1 = (x1 + x0) * (y1 + y0) */
-
-	/*
-		Add x0y0.
-	*/
-	internal_add(x0, x0y0, x1y1)    or_return /* t2 = x0y0 + x1y1 */
-	internal_sub(t1,   t1,   x0)    or_return /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
-
-	/*
-		shift by B.
-	*/
-	_private_int_shl_leg(t1, B)       or_return /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-	_private_int_shl_leg(x1y1, B * 2) or_return /* x1y1 = x1y1 << 2*B */
-
-	internal_add(t1, x0y0, t1)      or_return /* t1 = x0y0 + t1 */
-	internal_add(dest, t1, x1y1)    or_return /* t1 = x0y0 + t1 + x1y1 */
-
-	return nil
-}
-
-
-
-/*
-	Fast (comba) multiplier
-
-	This is the fast column-array [comba] multiplier.  It is
-	designed to compute the columns of the product first
-	then handle the carries afterwards.  This has the effect
-	of making the nested loops that compute the columns very
-	simple and schedulable on super-scalar processors.
-
-	This has been modified to produce a variable number of
-	digits of output so if say only a half-product is required
-	you don't have to compute the upper half (a feature
-	required for fast Barrett reduction).
-
-	Based on Algorithm 14.12 on pp.595 of HAC.
-*/
-_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	/*
-		Set up array.
-	*/
-	W: [_WARRAY]DIGIT = ---
-
-	/*
-		Grow the destination as required.
-	*/
-	internal_grow(dest, digits) or_return
-
-	/*
-		Number of output digits to produce.
-	*/
-	pa := min(digits, a.used + b.used)
-
-	/*
-		Clear the carry
-	*/
-	_W := _WORD(0)
-
-	ix: int
-	for ix = 0; ix < pa; ix += 1 {
-		tx, ty, iy, iz: int
-
-		/*
-			Get offsets into the two bignums.
-		*/
-		ty = min(b.used - 1, ix)
-		tx = ix - ty
-
-		/*
-			This is the number of times the loop will iterate, essentially.
-			while (tx++ < a->used && ty-- >= 0) { ... }
-		*/
-		 
-		iy = min(a.used - tx, ty + 1)
-
-		/*
-			Execute loop.
-		*/
-		#no_bounds_check for iz = 0; iz < iy; iz += 1 {
-			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
-		}
-
-		/*
-			Store term.
-		*/
-		W[ix] = DIGIT(_W) & _MASK
-
-		/*
-			Make next carry.
-		*/
-		_W = _W >> _WORD(_DIGIT_BITS)
-	}
-
-	/*
-		Setup dest.
-	*/
-	old_used := dest.used
-	dest.used = pa
-
-	/*
-		Now extract the previous digit [below the carry].
-	*/
-	copy_slice(dest.digit[0:], W[:pa])	
-
-	/*
-		Clear unused digits [that existed in the old copy of dest].
-	*/
-	internal_zero_unused(dest, old_used)
-
-	/*
-		Adjust dest.used based on leading zeroes.
-	*/
-
-	return internal_clamp(dest)
-}
-
-/*
-	Multiplies |a| * |b| and does not compute the lower digs digits
-	[meant to get the higher part of the product]
-*/
-_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	/*
-		Can we use the fast multiplier?
-	*/
-	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
-		return _private_int_mul_high_comba(dest, a, b, digits)
-	}
-
-	internal_grow(dest, a.used + b.used + 1) or_return
-	dest.used = a.used + b.used + 1
-
-	pa := a.used
-	pb := b.used
-	for ix := 0; ix < pa; ix += 1 {
-		carry := DIGIT(0)
-
-		for iy := digits - ix; iy < pb; iy += 1 {
-			/*
-				Calculate the double precision result.
-			*/
-			r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
-
-			/*
-				Get the lower part.
-			*/
-			dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
-
-			/*
-				Carry the carry.
-			*/
-			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
-		}
-		dest.digit[ix + pb] = carry
-	}
-	return internal_clamp(dest)
-}
-
-/*
-	This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
-	See the comments for `_private_int_mul_comba` to see how it works.
-
-	This is used in the Barrett reduction since for one of the multiplications
-	only the higher digits were needed.  This essentially halves the work.
-
-	Based on Algorithm 14.12 on pp.595 of HAC.
-*/
-_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	W: [_WARRAY]DIGIT = ---
-	_W: _WORD = 0
-
-	/*
-		Number of output digits to produce. Grow the destination as required.
-	*/
-	pa := a.used + b.used
-	internal_grow(dest, pa) or_return
-
-	ix: int
-	for ix = digits; ix < pa; ix += 1 {
-		/*
-			Get offsets into the two bignums.
-		*/
-		ty := min(b.used - 1, ix)
-		tx := ix - ty
-
-		/*
-			This is the number of times the loop will iterrate, essentially it's
-			while (tx++ < a->used && ty-- >= 0) { ... }
-		*/
-		iy := min(a.used - tx, ty + 1)
-
-		/*
-			Execute loop.
-		*/
-		for iz := 0; iz < iy; iz += 1 {
-			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
-		}
-
-		/*
-			Store term.
-		*/
-		W[ix] = DIGIT(_W) & DIGIT(_MASK)
-
-		/*
-			Make next carry.
-		*/
-		_W = _W >> _WORD(_DIGIT_BITS)
-	}
-
-	/*
-		Setup dest
-	*/
-	old_used := dest.used
-	dest.used = pa
-
-	for ix = digits; ix < pa; ix += 1 {
-		/*
-			Now extract the previous digit [below the carry].
-		*/
-		dest.digit[ix] = W[ix]
-	}
-
-	/*
-		Zero remainder.
-	*/
-	internal_zero_unused(dest, old_used)
-
-	/*
-		Adjust dest.used based on leading zeroes.
-	*/
-	return internal_clamp(dest)
-}
-
-/*
-	Single-digit multiplication with the smaller number as the single-digit.
-*/
-_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	a, b := a, b
-
-	a0, tmp, r := &Int{}, &Int{}, &Int{}
-	defer internal_destroy(a0, tmp, r)
-
-	b_size   := min(a.used, b.used)
-	n_blocks := max(a.used, b.used) / b_size
-
-	internal_grow(a0, b_size + 2) or_return
-	internal_init_multi(tmp, r)   or_return
-
-	/*
-		Make sure that `a` is the larger one.
-	*/
-	if a.used < b.used {
-		a, b = b, a
-	}
-	assert(a.used >= b.used)
-
-	i, j := 0, 0
-	for ; i < n_blocks; i += 1 {
-		/*
-			Cut a slice off of `a`.
-		*/
-
-		a0.used = b_size
-		internal_copy_digits(a0, a, a0.used, j)
-		j += a0.used
-		internal_clamp(a0)
-
-		/*
-			Multiply with `b`.
-		*/
-		internal_mul(tmp, a0, b)                                     or_return
-
-		/*
-			Shift `tmp` to the correct position.
-		*/
-		_private_int_shl_leg(tmp, b_size * i)                          or_return
-
-		/*
-			Add to output. No carry needed.
-		*/
-		internal_add(r, r, tmp)                                      or_return
-	}
-
-	/*
-		The left-overs; there are always left-overs.
-	*/
-	if j < a.used {
-		a0.used = a.used - j
-		internal_copy_digits(a0, a, a0.used, j)
-		j += a0.used
-		internal_clamp(a0)
-
-		internal_mul(tmp, a0, b)                                     or_return
-		_private_int_shl_leg(tmp, b_size * i)                          or_return
-		internal_add(r, r, tmp)                                      or_return
-	}
-
-	internal_swap(dest, r)
-	return
-}
-
-/*
-	Low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16
-	Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized.
-*/
-_private_int_sqr :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	pa := src.used
-
-	t := &Int{}; ix, iy: int
-	/*
-		Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger.
-	*/
-	internal_grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)) or_return
-	t.used = (2 * pa) + 1
-
-	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
-		carry := DIGIT(0)
-		/*
-			First calculate the digit at 2*ix; calculate double precision result.
-		*/
-		r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix]))
-
-		/*
-			Store lower part in result.
-		*/
-		t.digit[ix+ix] = DIGIT(r & _WORD(_MASK))
-		/*
-			Get the carry.
-		*/
-		carry = DIGIT(r >> _DIGIT_BITS)
-
-		#no_bounds_check for iy = ix + 1; iy < pa; iy += 1 {
-			/*
-				First calculate the product.
-			*/
-			r = _WORD(src.digit[ix]) * _WORD(src.digit[iy])
-
-			/* Now calculate the double precision result. Nóte we use
-			 * addition instead of *2 since it's easier to optimize
-			 */
-			r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry)
-
-			/*
-				Store lower part.
-			*/
-			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
-
-			/*
-				Get carry.
-			*/
-			carry = DIGIT(r >> _DIGIT_BITS)
-		}
-		/*
-			Propagate upwards.
-		*/
-		#no_bounds_check for carry != 0 {
-			r     = _WORD(t.digit[ix+iy]) + _WORD(carry)
-			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
-			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
-			iy += 1
-		}
-	}
-
-	err = internal_clamp(t)
-	internal_swap(dest, t)
-	internal_destroy(t)
-	return err
-}
-
-/*
-	The jist of squaring...
-	You do like mult except the offset of the tmpx [one that starts closer to zero] can't equal the offset of tmpy.
-	So basically you set up iy like before then you min it with (ty-tx) so that it never happens.
-	You double all those you add in the inner loop. After that loop you do the squares and add them in.
-
-	Assumes `dest` and `src` not to be `nil` and `src` to have been initialized.	
-*/
-_private_int_sqr_comba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	W: [_WARRAY]DIGIT = ---
-
-	/*
-		Grow the destination as required.
-	*/
-	pa := uint(src.used) + uint(src.used)
-	internal_grow(dest, int(pa)) or_return
-
-	/*
-		Number of output digits to produce.
-	*/
-	W1 := _WORD(0)
-	_W  : _WORD = ---
-	ix := uint(0)
-
-	#no_bounds_check for ; ix < pa; ix += 1 {
-		/*
-			Clear counter.
-		*/
-		_W = {}
-
-		/*
-			Get offsets into the two bignums.
-		*/
-		ty := min(uint(src.used) - 1, ix)
-		tx := ix - ty
-
-		/*
-			This is the number of times the loop will iterate,
-			essentially while (tx++ < a->used && ty-- >= 0) { ... }
-		*/
-		iy := min(uint(src.used) - tx, ty + 1)
-
-		/*
-			Now for squaring, tx can never equal ty.
-			We halve the distance since they approach at a rate of 2x,
-			and we have to round because odd cases need to be executed.
-		*/
-		iy = min(iy, ((ty - tx) + 1) >> 1 )
-
-		/*
-			Execute loop.
-		*/
-		#no_bounds_check for iz := uint(0); iz < iy; iz += 1 {
-			_W += _WORD(src.digit[tx + iz]) * _WORD(src.digit[ty - iz])
-		}
-
-		/*
-			Double the inner product and add carry.
-		*/
-		_W = _W + _W + W1
-
-		/*
-			Even columns have the square term in them.
-		*/
-		if ix & 1 == 0 {
-			_W += _WORD(src.digit[ix >> 1]) * _WORD(src.digit[ix >> 1])
-		}
-
-		/*
-			Store it.
-		*/
-		W[ix] = DIGIT(_W & _WORD(_MASK))
-
-		/*
-			Make next carry.
-		*/
-		W1 = _W >> _DIGIT_BITS
-	}
-
-	/*
-		Setup dest.
-	*/
-	old_used := dest.used
-	dest.used = src.used + src.used
-
-	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
-		dest.digit[ix] = W[ix] & _MASK
-	}
-
-	/*
-		Clear unused digits [that existed in the old copy of dest].
-	*/
-	internal_zero_unused(dest, old_used)
-
-	return internal_clamp(dest)
-}
-
-/*
-	Karatsuba squaring, computes `dest` = `src` * `src` using three half-size squarings.
- 
- 	See comments of `_private_int_mul_karatsuba` for details.
- 	It is essentially the same algorithm but merely tuned to perform recursive squarings.
-*/
-_private_int_sqr_karatsuba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	x0, x1, t1, t2, x0x0, x1x1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(x0, x1, t1, t2, x0x0, x1x1)
-
-	/*
-		Min # of digits, divided by two.
-	*/
-	B := src.used >> 1
-
-	/*
-		Init temps.
-	*/
-	internal_grow(x0,   B) or_return
-	internal_grow(x1,   src.used - B) or_return
-	internal_grow(t1,   src.used * 2) or_return
-	internal_grow(t2,   src.used * 2) or_return
-	internal_grow(x0x0, B * 2       ) or_return
-	internal_grow(x1x1, (src.used - B) * 2) or_return
-
-	/*
-		Now shift the digits.
-	*/
-	x0.used = B
-	x1.used = src.used - B
-
-	#force_inline internal_copy_digits(x0, src, x0.used)
-	#force_inline mem.copy_non_overlapping(&x1.digit[0], &src.digit[B], size_of(DIGIT) * x1.used)
-	#force_inline internal_clamp(x0)
-
-	/*
-		Now calc the products x0*x0 and x1*x1.
-	*/
-	internal_sqr(x0x0, x0) or_return
-	internal_sqr(x1x1, x1) or_return
-
-	/*
-		Now calc (x1+x0)^2
-	*/
-	internal_add(t1, x0, x1) or_return
-	internal_sqr(t1, t1) or_return
-
-	/*
-		Add x0y0
-	*/
-	internal_add(t2, x0x0, x1x1) or_return
-	internal_sub(t1, t1, t2) or_return
-
-	/*
-		Shift by B.
-	*/
-	_private_int_shl_leg(t1, B) or_return
-	_private_int_shl_leg(x1x1, B * 2) or_return
-	internal_add(t1, t1, x0x0) or_return
-	internal_add(dest, t1, x1x1) or_return
-
-	return #force_inline internal_clamp(dest)
-}
-
-/*
-	Squaring using Toom-Cook 3-way algorithm.
-
-	Setup and interpolation from algorithm SQR_3 in Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
-	  18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
-*/
-_private_int_sqr_toom :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	S0, a0, a1, a2 := &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(S0, a0, a1, a2)
-
-	/*
-		Init temps.
-	*/
-	internal_zero(S0) or_return
-
-	/*
-		B
-	*/
-	B := src.used / 3
-
-	/*
-		a = a2 * x^2 + a1 * x + a0;
-	*/
-	internal_grow(a0, B) or_return
-	internal_grow(a1, B) or_return
-	internal_grow(a2, src.used - (2 * B)) or_return
-
-	a0.used = B
-	a1.used = B
-	a2.used = src.used - 2 * B
-
-	#force_inline mem.copy_non_overlapping(&a0.digit[0], &src.digit[    0], size_of(DIGIT) * a0.used)
-	#force_inline mem.copy_non_overlapping(&a1.digit[0], &src.digit[    B], size_of(DIGIT) * a1.used)
-	#force_inline mem.copy_non_overlapping(&a2.digit[0], &src.digit[2 * B], size_of(DIGIT) * a2.used)
-
-	internal_clamp(a0)
-	internal_clamp(a1)
-	internal_clamp(a2)
-
-	/** S0 = a0^2;  */
-	internal_sqr(S0, a0) or_return
-
-	/** \\S1 = (a2 + a1 + a0)^2 */
-	/** \\S2 = (a2 - a1 + a0)^2  */
-	/** \\S1 = a0 + a2; */
-	/** a0 = a0 + a2; */
-	internal_add(a0, a0, a2) or_return
-	/** \\S2 = S1 - a1; */
-	/** b = a0 - a1; */
-	internal_sub(dest, a0, a1) or_return
-	/** \\S1 = S1 + a1; */
-	/** a0 = a0 + a1; */
-	internal_add(a0, a0, a1) or_return
-	/** \\S1 = S1^2;  */
-	/** a0 = a0^2; */
-	internal_sqr(a0, a0) or_return
-	/** \\S2 = S2^2;  */
-	/** b = b^2; */
-	internal_sqr(dest, dest) or_return
-	/** \\ S3 = 2 * a1 * a2  */
-	/** \\S3 = a1 * a2;  */
-	/** a1 = a1 * a2; */
-	internal_mul(a1, a1, a2) or_return
-	/** \\S3 = S3 << 1;  */
-	/** a1 = a1 << 1; */
-	internal_shl(a1, a1, 1) or_return
-	/** \\S4 = a2^2;  */
-	/** a2 = a2^2; */
-	internal_sqr(a2, a2) or_return
-	/** \\ tmp = (S1 + S2)/2  */
-	/** \\tmp = S1 + S2; */
-	/** b = a0 + b; */
-	internal_add(dest, a0, dest) or_return
-	/** \\tmp = tmp >> 1; */
-	/** b = b >> 1; */
-	internal_shr(dest, dest, 1) or_return
-	/** \\ S1 = S1 - tmp - S3  */
-	/** \\S1 = S1 - tmp; */
-	/** a0 = a0 - b; */
-	internal_sub(a0, a0, dest) or_return
-	/** \\S1 = S1 - S3;  */
-	/** a0 = a0 - a1; */
-	internal_sub(a0, a0, a1) or_return
-	/** \\S2 = tmp - S4 -S0  */
-	/** \\S2 = tmp - S4;  */
-	/** b = b - a2; */
-	internal_sub(dest, dest, a2) or_return
-	/** \\S2 = S2 - S0;  */
-	/** b = b - S0; */
-	internal_sub(dest, dest, S0) or_return
-	/** \\P = S4*x^4 + S3*x^3 + S2*x^2 + S1*x + S0; */
-	/** P = a2*x^4 + a1*x^3 + b*x^2 + a0*x + S0; */
-	_private_int_shl_leg(  a2, 4 * B) or_return
-	_private_int_shl_leg(  a1, 3 * B) or_return
-	_private_int_shl_leg(dest, 2 * B) or_return
-	_private_int_shl_leg(  a0, 1 * B) or_return
-
-	internal_add(a2, a2, a1) or_return
-	internal_add(dest, dest, a2) or_return
-	internal_add(dest, dest, a0) or_return
-	internal_add(dest, dest, S0) or_return
-	/** a^2 - P  */
-
-	return #force_inline internal_clamp(dest)
-}
-
-/*
-	Divide by three (based on routine from MPI and the GMP manual).
-*/
-_private_int_div_3 :: proc(quotient, numerator: ^Int, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
-	context.allocator = allocator
-
-	/*
-		b = 2^_DIGIT_BITS / 3
-	*/
- 	b := _WORD(1) << _WORD(_DIGIT_BITS) / _WORD(3)
-
-	q := &Int{}
-	internal_grow(q, numerator.used) or_return
-	q.used = numerator.used
-	q.sign = numerator.sign
-
-	w, t: _WORD
-	#no_bounds_check for ix := numerator.used; ix >= 0; ix -= 1 {
-		w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix])
-		if w >= 3 {
-			/*
-				Multiply w by [1/3].
-			*/
-			t = (w * b) >> _WORD(_DIGIT_BITS)
-
-			/*
-				Now subtract 3 * [w/3] from w, to get the remainder.
-			*/
-			w -= t+t+t
-
-			/*
-				Fixup the remainder as required since the optimization is not exact.
-			*/
-			for w >= 3 {
-				t += 1
-				w -= 3
-			}
-		} else {
-			t = 0
-		}
-		q.digit[ix] = DIGIT(t)
-	}
-	remainder = DIGIT(w)
-
-	/*
-		[optional] store the quotient.
-	*/
-	if quotient != nil {
-		err = clamp(q)
- 		internal_swap(q, quotient)
- 	}
-	internal_destroy(q)
-	return remainder, nil
-}
-
-/*
-	Signed Integer Division
-
-	c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20
-
-	Note that the description in HAC is horribly incomplete.
-	For example, it doesn't consider the case where digits are removed from 'x' in
-	the inner loop.
-
-	It also doesn't consider the case that y has fewer than three digits, etc.
-	The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
-*/
-_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	error_if_immutable(quotient, remainder) or_return
-
-	q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(q, x, y, t1, t2)
-
-	internal_grow(q, numerator.used + 2) or_return
-	q.used = numerator.used + 2
-
-	internal_init_multi(t1, t2) or_return
-	internal_copy(x, numerator) or_return
-	internal_copy(y, denominator) or_return
-
-	/*
-		Fix the sign.
-	*/
-	neg   := numerator.sign != denominator.sign
-	x.sign = .Zero_or_Positive
-	y.sign = .Zero_or_Positive
-
-	/*
-		Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT]
-	*/
-	norm := internal_count_bits(y) % _DIGIT_BITS
-
-	if norm < _DIGIT_BITS - 1 {
-		norm = (_DIGIT_BITS - 1) - norm
-		internal_shl(x, x, norm) or_return
-		internal_shl(y, y, norm) or_return
-	} else {
-		norm = 0
-	}
-
-	/*
-		Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4
-	*/
-	n := x.used - 1
-	t := y.used - 1
-
-	/*
-		while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
-		y = y*b**{n-t}
-	*/
-
-	_private_int_shl_leg(y, n - t) or_return
-
-	gte := internal_gte(x, y)
-	for gte {
-		q.digit[n - t] += 1
-		internal_sub(x, x, y) or_return
-		gte = internal_gte(x, y)
-	}
-
-	/*
-		Reset y by shifting it back down.
-	*/
-	_private_int_shr_leg(y, n - t)
-
-	/*
-		Step 3. for i from n down to (t + 1).
-	*/
-	#no_bounds_check for i := n; i >= (t + 1); i -= 1 {
-		if i > x.used { continue }
-
-		/*
-			step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt
-		*/
-		if x.digit[i] == y.digit[t] {
-			q.digit[(i - t) - 1] = 1 << (_DIGIT_BITS - 1)
-		} else {
-
-			tmp := _WORD(x.digit[i]) << _DIGIT_BITS
-			tmp |= _WORD(x.digit[i - 1])
-			tmp /= _WORD(y.digit[t])
-			if tmp > _WORD(_MASK) {
-				tmp = _WORD(_MASK)
-			}
-			q.digit[(i - t) - 1] = DIGIT(tmp & _WORD(_MASK))
-		}
-
-		/* while (q{i-t-1} * (yt * b + y{t-1})) >
-					xi * b**2 + xi-1 * b + xi-2
-
-			do q{i-t-1} -= 1;
-		*/
-
-		iter := 0
-
-		q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] + 1) & _MASK
-		#no_bounds_check for {
-			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK
-
-			/*
-				Find left hand.
-			*/
-			internal_zero(t1)
-			t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]
-			t1.digit[1] = y.digit[t]
-			t1.used = 2
-			internal_mul(t1, t1, q.digit[(i - t) - 1]) or_return
-
-			/*
-				Find right hand.
-			*/
-			t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]
-			t2.digit[1] = x.digit[i - 1] /* i >= 1 always holds */
-			t2.digit[2] = x.digit[i]
-			t2.used = 3
-
-			if internal_lte(t1, t2) {
-				break
-			}
-			iter += 1; if iter > 100 {
-				return .Max_Iterations_Reached
-			}
-		}
-
-		/*
-			Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
-		*/
-		int_mul_digit(t1, y, q.digit[(i - t) - 1]) or_return
-		_private_int_shl_leg(t1, (i - t) - 1) or_return
-		internal_sub(x, x, t1) or_return
-
-		/*
-			if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
-		*/
-		if x.sign == .Negative {
-			internal_copy(t1, y) or_return
-			_private_int_shl_leg(t1, (i - t) - 1) or_return
-			internal_add(x, x, t1) or_return
-
-			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK
-		}
-	}
-
-	/*
-		Now q is the quotient and x is the remainder, [which we have to normalize]
-		Get sign before writing to c.
-	*/
-	z, _ := is_zero(x)
-	x.sign = .Zero_or_Positive if z else numerator.sign
-
-	if quotient != nil {
-		internal_clamp(q)
-		internal_swap(q, quotient)
-		quotient.sign = .Negative if neg else .Zero_or_Positive
-	}
-
-	if remainder != nil {
-		internal_shr(x, x, norm) or_return
-		internal_swap(x, remainder)
-	}
-
-	return nil
-}
-
-/*
-	Direct implementation of algorithms 1.8 "RecursiveDivRem" and 1.9 "UnbalancedDivision" from:
-
-		Brent, Richard P., and Paul Zimmermann. "Modern computer arithmetic"
-		Vol. 18. Cambridge University Press, 2010
-		Available online at https://arxiv.org/pdf/1004.4710
-
-	pages 19ff. in the above online document.
-*/
-_private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	A1, A2, B1, B0, Q1, Q0, R1, R0, t := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(A1, A2, B1, B0, Q1, Q0, R1, R0, t)
-
-	m := a.used - b.used
-	k := m / 2
-
-	if m < MUL_KARATSUBA_CUTOFF {
-		return _private_int_div_school(quotient, remainder, a, b)
-	}
-
-	internal_init_multi(A1, A2, B1, B0, Q1, Q0, R1, R0, t) or_return
-
-	/*
-		`B1` = `b` / `beta`^`k`, `B0` = `b` % `beta`^`k`
-	*/
-	internal_shrmod(B1, B0, b, k * _DIGIT_BITS) or_return
-
-	/*
-		(Q1, R1) =  RecursiveDivRem(A / beta^(2k), B1)
-	*/
-	internal_shrmod(A1, t, a, 2 * k * _DIGIT_BITS) or_return
-	_private_div_recursion(Q1, R1, A1, B1) or_return
-
-	/*
-		A1 = (R1 * beta^(2k)) + (A % beta^(2k)) - (Q1 * B0 * beta^k)
-	*/
-	_private_int_shl_leg(R1, 2 * k) or_return
-	internal_add(A1, R1, t) or_return
-	internal_mul(t, Q1, B0) or_return
-
-	/*
-		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
-	*/
-	if internal_lt(A1, 0) {
-		internal_shl(t, b, k * _DIGIT_BITS) or_return
-
-		for {
-			internal_decr(Q1) or_return
-			internal_add(A1, A1, t) or_return
-			if internal_gte(A1, 0) { break }
-		}
-	}
-
-	/*
-		(Q0, R0) =  RecursiveDivRem(A1 / beta^(k), B1)
-	*/
-	internal_shrmod(A1, t, A1, k * _DIGIT_BITS) or_return
-	_private_div_recursion(Q0, R0, A1, B1) or_return
-
-	/*
-		A2 = (R0*beta^k) +  (A1 % beta^k) - (Q0*B0)
-	*/
-	_private_int_shl_leg(R0, k) or_return
-	internal_add(A2, R0, t) or_return
-	internal_mul(t, Q0, B0) or_return
-	internal_sub(A2, A2, t) or_return
-
-	/*
-		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
-	*/
-	for internal_is_negative(A2) { // internal_lt(A2, 0) {
-		internal_decr(Q0) or_return
-		internal_add(A2, A2, b) or_return
-	}
-
-	/*
-		Return q = (Q1*beta^k) + Q0, r = A2.
-	*/
-	_private_int_shl_leg(Q1, k) or_return
-	internal_add(quotient, Q1, Q0) or_return
-
-	return internal_copy(remainder, A2)
-}
-
-_private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	A, B, Q, Q1, R, A_div, A_mod := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(A, B, Q, Q1, R, A_div, A_mod)
-
-	internal_init_multi(A, B, Q, Q1, R, A_div, A_mod) or_return
-
-	/*
-		Most significant bit of a limb.
-		Assumes  _DIGIT_MAX < (sizeof(DIGIT) * sizeof(u8)).
-	*/
-	msb := (_DIGIT_MAX + DIGIT(1)) >> 1
-	sigma := 0
-	msb_b := b.digit[b.used - 1]
-	for msb_b < msb {
-		sigma += 1
-		msb_b <<= 1
-	}
-
-	/*
-		Use that sigma to normalize B.
-	*/
-	internal_shl(B, b, sigma) or_return
-	internal_shl(A, a, sigma) or_return
-
-	/*
-		Fix the sign.
-	*/
-	neg := a.sign != b.sign
-	A.sign = .Zero_or_Positive; B.sign = .Zero_or_Positive
-
-	/*
-		If the magnitude of "A" is not more more than twice that of "B" we can work
-		on them directly, otherwise we need to work at "A" in chunks.
-	*/
-	n := B.used
-	m := A.used - B.used
-
-	/*
-		Q = 0. We already ensured that when we called `internal_init_multi`.
-	*/
-	for m > n {
-		/*
-			(q, r) = RecursiveDivRem(A / (beta^(m-n)), B)
-		*/
-		j := (m - n) * _DIGIT_BITS
-		internal_shrmod(A_div, A_mod, A, j) or_return
-		_private_div_recursion(Q1, R, A_div, B) or_return
-
-		/*
-			Q = (Q*beta!(n)) + q
-		*/
-		internal_shl(Q, Q, n * _DIGIT_BITS) or_return
-		internal_add(Q, Q, Q1) or_return
-
-		/*
-			A = (r * beta^(m-n)) + (A % beta^(m-n))
-		*/
-		internal_shl(R, R, (m - n) * _DIGIT_BITS) or_return
-		internal_add(A, R, A_mod) or_return
-
-		/*
-			m = m - n
-		*/
-		m -= n
-	}
-
-	/*
-		(q, r) = RecursiveDivRem(A, B)
-	*/
-	_private_div_recursion(Q1, R, A, B) or_return
-
-	/*
-		Q = (Q * beta^m) + q, R = r
-	*/
-	internal_shl(Q, Q, m * _DIGIT_BITS) or_return
-	internal_add(Q, Q, Q1) or_return
-
-	/*
-		Get sign before writing to dest.
-	*/
-	R.sign = .Zero_or_Positive if internal_is_zero(Q) else a.sign
-
-	if quotient != nil {
-		swap(quotient, Q)
-		quotient.sign = .Negative if neg else .Zero_or_Positive
-	}
-	if remainder != nil {
-		/*
-			De-normalize the remainder.
-		*/
-		internal_shrmod(R, nil, R, sigma) or_return
-		swap(remainder, R)
-	}
-	return nil
-}
-
-/*
-	Slower bit-bang division... also smaller.
-*/
-@(deprecated="Use `_int_div_school`, it's 3.5x faster.")
-_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
-
-	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}
-
-	defer internal_destroy(ta, tb, tq, q)
-
-	for {
-		internal_one(tq) or_return
-
-		num_bits, _ := count_bits(numerator)
-		den_bits, _ := count_bits(denominator)
-		n := num_bits - den_bits
-
-		abs(ta, numerator)   or_return
-		abs(tb, denominator) or_return
-		shl(tb, tb, n)       or_return
-		shl(tq, tq, n)       or_return
-
-		for n >= 0 {
-			if internal_gte(ta, tb) {
-				// ta -= tb
-				sub(ta, ta, tb) or_return
-				//  q += tq
-				add( q, q,  tq) or_return
-			}
-			shr1(tb, tb) or_return
-			shr1(tq, tq) or_return
-
-			n -= 1
-		}
-
-		/*
-			Now q == quotient and ta == remainder.
-		*/
-		neg := numerator.sign != denominator.sign
-		if quotient != nil {
-			swap(quotient, q)
-			z, _ := is_zero(quotient)
-			quotient.sign = .Negative if neg && !z else .Zero_or_Positive
-		}
-		if remainder != nil {
-			swap(remainder, ta)
-			z, _ := is_zero(numerator)
-			remainder.sign = .Zero_or_Positive if z else numerator.sign
-		}
-
-		break
-	}
-	return err
-}
-
-
-
-/*
-	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html
-*/
-_private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(inner, outer, start, stop, temp)
-
-	internal_one(inner, false)                                       or_return
-	internal_one(outer, false)                                       or_return
-
-	bits_used := ilog2(n)
-
-	for i := bits_used; i >= 0; i -= 1 {
-		start := (n >> (uint(i) + 1)) + 1 | 1
-		stop  := (n >> uint(i)) + 1 | 1
-		_private_int_recursive_product(temp, start, stop, 0)         or_return
-		internal_mul(inner, inner, temp)                             or_return
-		internal_mul(outer, outer, inner)                            or_return
-	}
-	shift := n - intrinsics.count_ones(n)
-
-	return internal_shl(res, outer, int(shift))
-}
-
-/*
-	Recursive product used by binary split factorial algorithm.
-*/
-_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	t1, t2 := &Int{}, &Int{}
-	defer internal_destroy(t1, t2)
-
-	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS {
-		return .Max_Iterations_Reached
-	}
-
-	num_factors := (stop - start) >> 1
-	if num_factors == 2 {
-		internal_set(t1, start, false)                               or_return
-		when true {
-			internal_grow(t2, t1.used + 1, false)                    or_return
-			internal_add(t2, t1, 2)                                  or_return
-		} else {
-			internal_add(t2, t1, 2)                                  or_return
-		}
-		return internal_mul(res, t1, t2)
-	}
-
-	if num_factors > 1 {
-		mid := (start + num_factors) | 1
-		_private_int_recursive_product(t1, start,  mid, level + 1)   or_return
-		_private_int_recursive_product(t2,   mid, stop, level + 1)   or_return
-		return internal_mul(res, t1, t2)
-	}
-
-	if num_factors == 1 {
-		return #force_inline internal_set(res, start, true)
-	}
-
-	return #force_inline internal_one(res, true)
-}
-
-/*
-	Internal function computing both GCD using the binary method,
-		and, if target isn't `nil`, also LCM.
-
-	Expects the `a` and `b` to have been initialized
-		and one or both of `res_gcd` or `res_lcm` not to be `nil`.
-
-	If both `a` and `b` are zero, return zero.
-	If either `a` or `b`, return the other one.
-
-	The `gcd` and `lcm` wrappers have already done this test,
-	but `gcd_lcm` wouldn't have, so we still need to perform it.
-
-	If neither result is wanted, we have nothing to do.
-*/
-_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	if res_gcd == nil && res_lcm == nil {
-		return nil
-	}
-
-	/*
-		We need a temporary because `res_gcd` is allowed to be `nil`.
-	*/
-	if a.used == 0 && b.used == 0 {
-		/*
-			GCD(0, 0) and LCM(0, 0) are both 0.
-		*/
-		if res_gcd != nil {
-			internal_zero(res_gcd) or_return
-		}
-		if res_lcm != nil {
-			internal_zero(res_lcm) or_return
-		}
-		return nil
-	} else if a.used == 0 {
-		/*
-			We can early out with GCD = B and LCM = 0
-		*/
-		if res_gcd != nil {
-			internal_abs(res_gcd, b) or_return
-		}
-		if res_lcm != nil {
-			internal_zero(res_lcm) or_return
-		}
-		return nil
-	} else if b.used == 0 {
-		/*
-			We can early out with GCD = A and LCM = 0
-		*/
-		if res_gcd != nil {
-			internal_abs(res_gcd, a) or_return
-		}
-		if res_lcm != nil {
-			internal_zero(res_lcm) or_return
-		}
-		return nil
-	}
-
-	temp_gcd_res := &Int{}
-	defer internal_destroy(temp_gcd_res)
-
-	/*
-		If neither `a` or `b` was zero, we need to compute `gcd`.
- 		Get copies of `a` and `b` we can modify.
- 	*/
-	u, v := &Int{}, &Int{}
-	defer internal_destroy(u, v)
-	internal_copy(u, a) or_return
-	internal_copy(v, b) or_return
-
- 	/*
- 		Must be positive for the remainder of the algorithm.
- 	*/
-	u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive
-
- 	/*
- 		B1.  Find the common power of two for `u` and `v`.
- 	*/
- 	u_lsb, _ := internal_count_lsb(u)
- 	v_lsb, _ := internal_count_lsb(v)
- 	k        := min(u_lsb, v_lsb)
-
-	if k > 0 {
-		/*
-			Divide the power of two out.
-		*/
-		internal_shr(u, u, k) or_return
-		internal_shr(v, v, k) or_return
-	}
-
-	/*
-		Divide any remaining factors of two out.
-	*/
-	if u_lsb != k {
-		internal_shr(u, u, u_lsb - k) or_return
-	}
-	if v_lsb != k {
-		internal_shr(v, v, v_lsb - k) or_return
-	}
-
-	for v.used != 0 {
-		/*
-			Make sure `v` is the largest.
-		*/
-		if internal_gt(u, v) {
-			/*
-				Swap `u` and `v` to make sure `v` is >= `u`.
-			*/
-			internal_swap(u, v)
-		}
-
-		/*
-			Subtract smallest from largest.
-		*/
-		internal_sub(v, v, u) or_return
-
-		/*
-			Divide out all factors of two.
-		*/
-		b, _ := internal_count_lsb(v)
-		internal_shr(v, v, b) or_return
-	}
-
- 	/*
- 		Multiply by 2**k which we divided out at the beginning.
- 	*/
- 	internal_shl(temp_gcd_res, u, k) or_return
- 	temp_gcd_res.sign = .Zero_or_Positive
-
-	/*
-		We've computed `gcd`, either the long way, or because one of the inputs was zero.
-		If we don't want `lcm`, we're done.
-	*/
-	if res_lcm == nil {
-		internal_swap(temp_gcd_res, res_gcd)
-		return nil
-	}
-
-	/*
-		Computes least common multiple as `|a*b|/gcd(a,b)`
-		Divide the smallest by the GCD.
-	*/
-	if internal_lt_abs(a, b) {
-		/*
-			Store quotient in `t2` such that `t2 * b` is the LCM.
-		*/
-		internal_div(res_lcm, a, temp_gcd_res) or_return
-		err = internal_mul(res_lcm, res_lcm, b)
-	} else {
-		/*
-			Store quotient in `t2` such that `t2 * a` is the LCM.
-		*/
-		internal_div(res_lcm, b, temp_gcd_res) or_return
-		err = internal_mul(res_lcm, res_lcm, a)
-	}
-
-	if res_gcd != nil {
-		internal_swap(temp_gcd_res, res_gcd)
-	}
-
-	/*
-		Fix the sign to positive and return.
-	*/
-	res_lcm.sign = .Zero_or_Positive
-	return err
-}
-
-/*
-	Internal implementation of log.
-	Assumes `a` not to be `nil` and to have been initialized.
-*/
-_private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -> (res: int, err: Error) {
-	bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(bracket_low, bracket_high, bracket_mid, t, bi_base)
-
-	ic := #force_inline internal_cmp(a, base)
-	if ic == -1 || ic == 0 {
-		return 1 if ic == 0 else 0, nil
-	}
-	defer if err != nil {
-		res = -1
-	}
-
-	internal_set(bi_base, base, true, allocator)       or_return
-	internal_clear(bracket_mid, false, allocator)      or_return
-	internal_clear(t, false, allocator)                or_return
-	internal_one(bracket_low, false, allocator)        or_return
-	internal_set(bracket_high, base, false, allocator) or_return
-
-	low := 0; high := 1
-
-	/*
-		A kind of Giant-step/baby-step algorithm.
-		Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/
-		The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped
-		for small n.
-	*/
-
-	for {
-		/*
-			Iterate until `a` is bracketed between low + high.
-		*/
-		if #force_inline internal_gte(bracket_high, a) { break }
-
-		low = high
-		#force_inline internal_copy(bracket_low, bracket_high) or_return
-		high <<= 1
-		#force_inline internal_sqr(bracket_high, bracket_high) or_return
-	}
-
-	for (high - low) > 1 {
-		mid := (high + low) >> 1
-
-		#force_inline internal_pow(t, bi_base, mid - low) or_return
-
-		#force_inline internal_mul(bracket_mid, bracket_low, t) or_return
-
-		mc := #force_inline internal_cmp(a, bracket_mid)
-		switch mc {
-		case -1:
-			high = mid
-			internal_swap(bracket_mid, bracket_high)
-		case  0:
-			return mid, nil
-		case  1:
-			low = mid
-			internal_swap(bracket_mid, bracket_low)
-		}
-	}
-
-	fc := #force_inline internal_cmp(bracket_high, a)
-	res = high if fc == 0 else low
-
-	return
-}
-
-/*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
-	This is an optimized implementation of `internal_montgomery_reduce`
-	which uses the comba method to quickly calculate the columns of the reduction.
-	Based on Algorithm 14.32 on pp.601 of HAC.
-*/
-_private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	W: [_WARRAY]_WORD = ---
-
-	if x.used > _WARRAY { return .Invalid_Argument }
-
-	/*
-		Get old used count.
-	*/
-	old_used := x.used
-
-	/*
-		Grow `x` as required.
-	*/
-	internal_grow(x, n.used + 1)                                     or_return
-
-	/*
-		First we have to get the digits of the input into an array of double precision words W[...]
-		Copy the digits of `x` into W[0..`x.used` - 1]
-	*/
-	ix: int
-	for ix = 0; ix < x.used; ix += 1 {
-		W[ix] = _WORD(x.digit[ix])
-	}
-
-	/*
-		Zero the high words of W[a->used..m->used*2].
-	*/
-	zero_upper := (n.used * 2) + 1
-	if ix < zero_upper {
-		for ix = x.used; ix < zero_upper; ix += 1 {
-			W[ix] = {}
-		}
-	}
-
-	/*
-		Now we proceed to zero successive digits from the least significant upwards.
-	*/
-	for ix = 0; ix < n.used; ix += 1 {
-		/*
-			`mu = ai * m' mod b`
-
-			We avoid a double precision multiplication (which isn't required)
-			by casting the value down to a DIGIT.  Note this requires
-			that W[ix-1] have the carry cleared (see after the inner loop)
-		*/
-		mu := ((W[ix] & _WORD(_MASK)) * _WORD(rho)) & _WORD(_MASK)
-
-		/*
-			`a = a + mu * m * b**i`
-		
-			This is computed in place and on the fly.  The multiplication
-		 	by b**i is handled by offseting which columns the results
-		 	are added to.
-		
-			Note the comba method normally doesn't handle carries in the
-			inner loop In this case we fix the carry from the previous
-			column since the Montgomery reduction requires digits of the
-			result (so far) [see above] to work.
-
-			This is	handled by fixing up one carry after the inner loop.
-			The carry fixups are done in order so after these loops the
-			first m->used words of W[] have the carries fixed.
-		*/
-		for iy := 0; iy < n.used; iy += 1 {
-			W[ix + iy] += mu * _WORD(n.digit[iy])
-		}
-
-		/*
-			Now fix carry for next digit, W[ix+1].
-		*/
-		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
-	}
-
-	/*
-		Now we have to propagate the carries and shift the words downward
-		[all those least significant digits we zeroed].
-	*/
-
-	for ; ix < n.used * 2; ix += 1 {
-		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
-	}
-
-	/* copy out, A = A/b**n
-	 *
-	 * The result is A/b**n but instead of converting from an
-	 * array of mp_word to mp_digit than calling mp_rshd
-	 * we just copy them in the right order
-	 */
-
-	for ix = 0; ix < (n.used + 1); ix += 1 {
-		x.digit[ix] = DIGIT(W[n.used + ix] & _WORD(_MASK))
-	}
-
-	/*
-		Set the max used.
-	*/
-	x.used = n.used + 1
-
-	/*
-		Zero old_used digits, if the input a was larger than m->used+1 we'll have to clear the digits.
-	*/
-	internal_zero_unused(x, old_used)
-	internal_clamp(x)
-
-	/*
-		if A >= m then A = A - m
-	*/
-	if internal_gte_abs(x, n) {
-		return internal_sub(x, x, n)
-	}
-	return nil
-}
-
-/*
-	Computes xR**-1 == x (mod N) via Montgomery Reduction.
-	Assumes `x` and `n` not to be nil.
-*/
-_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	/*
-		Can the fast reduction [comba] method be used?
-		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
-		since carries are fixed up in the inner loop.
-	*/
-	internal_clear_if_uninitialized(x, n) or_return
-
-	digs := (n.used * 2) + 1
-	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
-		return _private_montgomery_reduce_comba(x, n, rho)
-	}
-
-	/*
-		Grow the input as required
-	*/
-	internal_grow(x, digs)                                           or_return
-	x.used = digs
-
-	for ix := 0; ix < n.used; ix += 1 {
-		/*
-			`mu = ai * rho mod b`
-			The value of rho must be precalculated via `int_montgomery_setup()`,
-			such that it equals -1/n0 mod b this allows the following inner loop
-			to reduce the input one digit at a time.
-		*/
-
-		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))
-
-		/*
-			a = a + mu * m * b**i
-			Multiply and add in place.
-		*/
-		u  := DIGIT(0)
-		iy := int(0)
-		for ; iy < n.used; iy += 1 {
-			/*
-				Compute product and sum.
-			*/
-			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))
-
-			/*
-				Get carry.
-			*/
-			u = DIGIT(r >> _DIGIT_BITS)
-
-			/*
-				Fix digit.
-			*/
-			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
-		}
-
-		/*
-			At this point the ix'th digit of x should be zero.
-			Propagate carries upwards as required.
-		*/
-		for u != 0 {
-			x.digit[ix + iy] += u
-			u = x.digit[ix + iy] >> _DIGIT_BITS
-			x.digit[ix + iy] &= _MASK
-			iy += 1
-		}
-	}
-
-	/*
-		At this point the n.used'th least significant digits of x are all zero,
-		which means we can shift x to the right by n.used digits and the
-		residue is unchanged.
-
-		x = x/b**n.used.
-	*/
-	internal_clamp(x)
-	_private_int_shr_leg(x, n.used)
-
-	/*
-		if x >= n then x = x - n
-	*/
-	if internal_gte_abs(x, n) {
-		return internal_sub(x, x, n)
-	}
-
-	return nil
-}
-
-/*
-	Shifts with subtractions when the result is greater than b.
-
-	The method is slightly modified to shift B unconditionally upto just under
-	the leading bit of b.  This saves alot of multiple precision shifting.
-
-	Assumes `a` and `b` not to be `nil`.
-*/
-_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	/*
-		How many bits of last digit does b use.
-	*/
-	internal_clear_if_uninitialized(a, b) or_return
-
-	bits := internal_count_bits(b) % _DIGIT_BITS
-
-	if b.used > 1 {
-		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
-		internal_int_power_of_two(a, power)                          or_return
-	} else {
-		internal_one(a)                                              or_return
-		bits = 1
-	}
-
-	/*
-		Now compute C = A * B mod b.
-	*/
-	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
-		internal_int_shl1(a, a)                                      or_return
-		if internal_gte_abs(a, b) {
-			internal_sub(a, a, b)                                    or_return
-		}
-	}
-	return nil
-}
-
-/*
-	Sets up the Montgomery reduction stuff.
-*/
-_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
-	/*
-		Fast inversion mod 2**k
-		Based on the fact that:
-
-		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
-		                  =>  2*X*A - X*X*A*A = 1
-		                  =>  2*(1) - (1)     = 1
-	*/
-	internal_clear_if_uninitialized(n, allocator) or_return
-
-	b := n.digit[0]
-	if b & 1 == 0 { return 0, .Invalid_Argument }
-
-	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
-	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
-	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */
-
-	when _DIGIT_TYPE_BITS == 64 {
-		x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
-		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
-	}
-
-	/*
-		rho = -1/m mod b
-	*/
-	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
-	return rho, nil
-}
-
-/*
-	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
-	From HAC pp.604 Algorithm 14.42
-
-	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
-*/
-_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	q := &Int{}
-	defer internal_destroy(q)
-	um := m.used
-
-	/*
-		q = x
-	*/
-	internal_copy(q, x)                                              or_return
-
-	/*
-		q1 = x / b**(k-1)
-	*/
-	_private_int_shr_leg(q, um - 1)
-
-	/*
-		According to HAC this optimization is ok.
-	*/
-	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
-		internal_mul(q, q, mu)                                       or_return
-	} else {
-		_private_int_mul_high(q, q, mu, um)                          or_return
-	}
-
-	/*
-		q3 = q2 / b**(k+1)
-	*/
-	_private_int_shr_leg(q, um + 1)
-
-	/*
-		x = x mod b**(k+1), quick (no division)
-	*/
-	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1))              or_return
-
-	/*
-		q = q * m mod b**(k+1), quick (no division)
-	*/
-	_private_int_mul(q, q, m, um + 1)                                or_return
-
-	/*
-		x = x - q
-	*/
-	internal_sub(x, x, q)                                            or_return
-
-	/*
-		If x < 0, add b**(k+1) to it.
-	*/
-	if internal_is_negative(x) {
-		internal_set(q, 1)                                           or_return
-		_private_int_shl_leg(q, um + 1)                                or_return
-		internal_add(x, x, q)                                        or_return
-	}
-
-	/*
-		Back off if it's too big.
-	*/
-	for internal_gte(x, m) {
-		internal_sub(x, x, m)                                        or_return
-	}
-
-	return nil
-}
-
-/*
-	Reduces `a` modulo `n` in place, for a modulus `n` of the form 2**p - d where `d` fits in a single digit.
-
-	`p` is taken to be the bit count of `n`. Every pass splits `a` into `a` / 2**p and `a` mod 2**p,
-	folds the high part back in multiplied by `d`, and subtracts `n` for as long as the sum is not
-	smaller than `n` in magnitude.
-
-	The temporary is allocated with `allocator`; an error from any helper is returned unchanged.
-*/
-_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	high := &Int{}
-	defer internal_destroy(high)
-	internal_zero(high) or_return
-
-	modulus_bits := internal_count_bits(n)
-
-	for {
-		/*
-			high = a / 2**p, a = a mod 2**p
-		*/
-		internal_shrmod(high, a, a, modulus_bits) or_return
-
-		/*
-			high = high * d, skipped when `d` is one because the product would not change.
-		*/
-		if d != 1 {
-			internal_mul(high, high, d) or_return
-		}
-
-		/*
-			a = a + high
-		*/
-		internal_add(a, a, high) or_return
-
-		/*
-			Finished once |a| < |n|; otherwise take `n` off and go around again.
-		*/
-		if internal_lt_abs(a, n) {
-			break
-		}
-		internal_sub(a, a, n) or_return
-	}
-
-	return nil
-}
-
-/*
-	Reduces `a` modulo `n` in place, for a modulus `n` of the form 2**p - d.
-	Unlike `_private_int_reduce_2k`, `d` is a full `Int` and may therefore span more than one digit.
-
-	`p` is taken to be the bit count of `n`. Every pass splits `a` into `a` / 2**p and `a` mod 2**p,
-	folds the high part back in multiplied by `d`, and subtracts `n` for as long as the sum is not
-	smaller than `n` in magnitude.
-
-	The temporary is allocated with `allocator`; an error from any helper is returned unchanged.
-*/
-_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	high := &Int{}
-	defer internal_destroy(high)
-	internal_zero(high) or_return
-
-	modulus_bits := internal_count_bits(n)
-
-	for {
-		/*
-			high = a / 2**p, a = a mod 2**p
-		*/
-		internal_shrmod(high, a, a, modulus_bits) or_return
-
-		/*
-			high = high * d
-		*/
-		internal_mul(high, high, d) or_return
-
-		/*
-			a = a + high
-		*/
-		internal_add(a, a, high) or_return
-
-		/*
-			Finished once |a| < |n|; otherwise take `n` off and go around again.
-		*/
-		if internal_lt_abs(a, n) {
-			break
-		}
-		internal_sub(a, a, n) or_return
-	}
-
-	return nil
-}
-
-/*
-	Determines if `internal_int_reduce_2k` can be used.
-	Assumes `a` not to be `nil` and to have been initialized.
-
-	Zero is never reducible. A non-zero value using at most one digit always is.
-	Anything larger qualifies only if every bit from the second digit upwards is set.
-	The returned `err` is always `nil`.
-*/
-_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a)
-
-	if internal_is_zero(a) {
-		return false, nil
-	}
-	if a.used <= 1 {
-		return true, nil
-	}
-
-	total_bits := internal_count_bits(a)
-	leg        := 1
-	mask       := DIGIT(1)
-
-	/*
-		Walk every bit from the start of the second digit up to the top bit; each one must be 1.
-	*/
-	for bit := _DIGIT_BITS; bit < total_bits; bit += 1 {
-		if a.digit[leg] & mask == 0 {
-			return false, nil
-		}
-
-		/*
-			Advance to the next bit, moving on to the next digit once the mask leaves the digit's range.
-		*/
-		mask <<= 1
-		if mask > _DIGIT_MAX {
-			leg += 1
-			mask = 1
-		}
-	}
-	return true, nil
-}
-
-/*
-	Determines if `internal_int_reduce_2k_l` can be used.
-	Assumes `a` not to be `nil` and to have been initialized.
-
-	Zero is never reducible and a single-digit value always is.
-	A larger value qualifies when at least half of its used digits (rounded down) equal `_DIGIT_MAX`.
-	The returned `err` is always `nil`.
-*/
-_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
-	assert_if_nil(a)
-
-	if internal_int_is_zero(a) {
-		return false, nil
-	}
-	if a.used == 1 {
-		return true, nil
-	}
-	if a.used < 1 {
-		return false, nil
-	}
-
-	/*
-		Count the digits that have every bit set.
-	*/
-	saturated := 0
-	for idx := 0; idx < a.used; idx += 1 {
-		if a.digit[idx] == _DIGIT_MAX {
-			saturated += 1
-		}
-	}
-
-	/*
-		If at least half of the digits are -1 we're sold.
-	*/
-	return saturated >= (a.used / 2), nil
-}
-
-/*
-	Determines the setup value `d` for `_private_int_reduce_2k`.
-	Assumes `a` is not `nil`.
-
-	Computes 2**bits(`a`) - `a` in a temporary and returns its least significant digit.
-	The temporary is allocated with `allocator` and destroyed before returning.
-*/
-_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
-	context.allocator = allocator
-
-	diff := &Int{}
-	defer internal_destroy(diff)
-	internal_zero(diff) or_return
-
-	/*
-		diff = 2**bits(a) - a
-	*/
-	bits := internal_count_bits(a)
-	internal_int_power_of_two(diff, bits) or_return
-	internal_sub(diff, diff, a) or_return
-
-	d = diff.digit[0]
-	return d, nil
-}
-
-/*
-	Determines the setup value for `_private_int_reduce_2k_l`.
-	Assumes `mu` and `P` are not `nil`.
-
-	Stores 2**bits(`P`) - `P` in `mu`.
-	The temporary is allocated with `allocator` and destroyed before returning.
-*/
-_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	power := &Int{}
-	defer internal_destroy(power)
-	internal_zero(power) or_return
-
-	/*
-		power = 2**bits(P), mu = power - P
-	*/
-	bits := internal_count_bits(P)
-	internal_int_power_of_two(power, bits) or_return
-
-	return internal_sub(mu, power, P)
-}
-
-/*
-	Pre-calculates the value required for Barrett reduction.
-	For a given modulus `P` it calculates `mu` = 2**(2 * `P.used` * _DIGIT_BITS) / `P`.
-	Assumes `mu` and `P` are not `nil`.
-*/
-_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	/*
-		mu = 2**(2 * used * _DIGIT_BITS)
-	*/
-	radix_bits := P.used * 2 * _DIGIT_BITS
-	internal_int_power_of_two(mu, radix_bits) or_return
-
-	/*
-		mu = mu / P
-	*/
-	internal_int_div(mu, mu, P) or_return
-	return nil
-}
-
-/*
-	Determines the setup value for `_private_int_dr_reduce`:
-	2**_DIGIT_BITS minus the least significant digit of `a`.
-	Assumes `a` to not be `nil` and to have been initialized.
-*/
-_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
-	/*
-		The cast is required if _DIGIT_BITS is one less than
-		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
-	*/
-	d = DIGIT((1 << _DIGIT_BITS) - a.digit[0])
-	return
-}
-
-/*
-	Determines if a number is a valid DR (Diminished Radix) modulus.
-	Assumes `a` to not be `nil` and to have been initialized.
-
-	A valid modulus uses at least two digits, and every digit above the first equals `_MASK`.
-*/
-_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
-	/*
-		Must be at least two digits.
-	*/
-	if a.used < 2 {
-		return false
-	}
-
-	/*
-		Must be of the form b**k - a [a <= b], so all but the first digit must be equal to -1 (mod b).
-	*/
-	for leg in a.digit[1:a.used] {
-		if leg != _MASK {
-			return false
-		}
-	}
-	return true
-}
-
-/*
-	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-	Based on algorithm from the paper
-
-		"Generating Efficient Primes for Discrete Log Cryptosystems"
-					Chae Hoon Lim, Pil Joong Lee,
-			POSTECH Information Research Laboratories
-
-	The modulus must be of a special format [see manual].
-	Has been modified to use algorithm 7.10 from the LTM book instead
-
-	`k` is the setup value for `n`, as produced by `_private_int_dr_setup`.
-
-	Input x must be in the range 0 <= x <= (n-1)**2
-	Assumes `x` and `n` to not be `nil` and to have been initialized.
-
-	`x` is grown to at least twice the digit count of `n` using `allocator`.
-	Any error from growing, clamping or subtracting is returned unchanged.
-*/
-_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
-	/*
-		Install the caller's allocator so the grow and subtract calls below honour it,
-		as the other procedures taking an `allocator` parameter do.
-	*/
-	context.allocator = allocator
-
-	/*
-		m = digits in modulus.
-	*/
-	m := n.used
-
-	/*
-		Ensure that "x" has at least 2m digits.
-	*/
-	internal_grow(x, m + m)                                          or_return
-
-	/*
-		Top of loop, this is where the code resumes if another reduction pass is required.
-	*/
-	for {
-		i: int
-		mu := DIGIT(0)
-
-		/*
-			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
-		*/
-		for i = 0; i < m; i += 1 {
-			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
-			x.digit[i] = DIGIT(r & _WORD(_MASK))
-			mu         = DIGIT(r >> _WORD(_DIGIT_BITS))
-		}
-
-		/*
-			Set final carry.
-		*/
-		x.digit[i] = mu
-
-		/*
-			Zero the used words above the carry, i.e. digits m + 1 up to (not including) `x.used`.
-			When `x` uses no more than m + 1 digits there is nothing to clear; taking a slice in
-			that case would ask for a negative length.
-		*/
-		if x.used > m + 1 {
-			mem.zero_slice(x.digit[m + 1:x.used])
-		}
-
-		/*
-			Clamp, sub and return.
-		*/
-		internal_clamp(x)                                            or_return
-
-		/*
-			If x >= n then subtract and reduce again.
-			Each successive "recursion" makes the input smaller and smaller.
-		*/
-		if internal_lt_abs(x, n) { break }
-
-		internal_sub(x, x, n)                                        or_return
-	}
-	return nil
-}
-
-/*
-	Computes res == G**X mod P using a left-to-right sliding window over the bits of `X`.
-	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
-
-	`redmode` selects the reduction: 0 uses `_private_int_reduce` with a `mu` prepared by
-	`_private_int_reduce_setup`; any other value uses `_private_int_reduce_2k_l` with a `mu`
-	prepared by `_private_int_reduce_2k_setup_l`.
-
-	The window table and `mu` are allocated with `allocator` and destroyed before returning.
-	The first error reported by any helper is returned unchanged.
-*/
-_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	M := [_TAB_SIZE]Int{}
-	winsize: uint
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)
-
-	/*
-		On exit, destroy the table entries this procedure uses: M[1] and the upper half of the table.
-		`winsize` is read when the deferred block runs, i.e. after it has been chosen below.
-	*/
-	defer {
-		internal_destroy(&M[1])
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x])
-		}
-	}
-
-	/*
-		Find window size from the bit count of the exponent.
-	*/
-	x := internal_count_bits(X)
-	switch {
-	case x <= 7:
-		winsize = 2
-	case x <= 36:
-		winsize = 3
-	case x <= 140:
-		winsize = 4
-	case x <= 450:
-		winsize = 5
-	case x <= 1303:
-		winsize = 6
-	case x <= 3529:
-		winsize = 7
-	case:
-		winsize = 8
-	}
-
-	/*
-		A positive `_MAX_WIN_SIZE` caps the window size.
-	*/
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
-
-	/*
-		Init M array.
-		Init first cell.
-	*/
-	internal_zero(&M[1])                                             or_return
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_zero(&M[x])                                         or_return
-	}
-
-	/*
-		Create `mu`, the precomputed value handed to the selected reduction.
-	*/
-	mu := &Int{}
-	defer internal_destroy(mu)
-	internal_zero(mu)                                                or_return
-
-	if redmode == 0 {
-		_private_int_reduce_setup(mu, P)                             or_return
-		redux = _private_int_reduce
-	} else {
-		_private_int_reduce_2k_setup_l(mu, P)                        or_return
-		redux = _private_int_reduce_2k_l
-	}
-
-	/*
-		Create M table.
-
-		The M table contains powers of the base, e.g. M[x] = G**x mod P.
-		The first half of the table is not computed, though, except for M[1].
-	*/
-	internal_int_mod(&M[1], G, P)                                    or_return
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-
-		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
-		instead of repeated squaring.
-	*/
-	slot := 1 << (winsize - 1)
-	internal_copy(&M[slot], &M[1])                                   or_return
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		/*
-			Square it.
-		*/
-		internal_sqr(&M[slot], &M[slot])                             or_return
-
-		/*
-			Reduce modulo P
-		*/
-		redux(&M[slot], P, mu)                                       or_return
-	}
-
-	/*
-		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
-		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
-	*/
-	for x = slot + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
-		redux(&M[x], P, mu)                                          or_return
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_one(res)                                                or_return
-
-	/*
-		Set initial mode and bit cnt.
-
-		mode 0: still skipping the leading zero bits of the exponent.
-		mode 1: between windows; a zero bit costs one squaring.
-		mode 2: collecting bits into the current window.
-
-		`bitcnt` starts at 1 so the first pass through the loop fetches the top digit of `X`.
-	*/
-	mode   := 0
-	bitcnt := 1
-	buf    := DIGIT(0)
-	digidx := X.used - 1
-	bitcpy := uint(0)
-	bitbuf := DIGIT(0)
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits.
-			*/
-			if digidx == -1 { break }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx]
-			digidx -= 1
-			bitcnt = _DIGIT_BITS
-		}
-
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := buf >> (_DIGIT_BITS - 1) & 1
-		buf <<= 1
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit
-			in the exponent.  Technically this opt is not required but it
-			does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 {
-			continue
-		}
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return
-			redux(res, P, mu)                                        or_return
-			continue
-		}
-
-		/*
-			Else we add it to the window, filling `bitbuf` from its most significant window bit downwards.
-		*/
-		bitcpy += 1
-		bitbuf |= (y << (winsize - bitcpy))
-		mode    = 2
-
-		if (bitcpy == winsize) {
-			/*
-				Window is filled so square as required and multiply.
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return
-				redux(res, P, mu)                                    or_return
-			}
-
-			/*
-				Then multiply by the table entry selected by the window bits.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return
-			redux(res, P, mu)                                        or_return
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0
-			bitbuf = 0
-			mode   = 1
-		}
-	}
-
-	/*
-		If bits remain in a partially filled window then square/multiply one bit at a time.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < int(bitcpy); x += 1 {
-			internal_sqr(res, res)                                   or_return
-			redux(res, P, mu)                                        or_return
-
-			/*
-				Shift the next collected bit up into position `winsize` and test it there.
-			*/
-			bitbuf <<= 1
-			if ((bitbuf & (1 << winsize)) != 0) {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return
-				redux(res, P, mu)                                    or_return
-			}
-		}
-	}
-	return err
-}
-
-/*
-	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
-
-	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
-	The value of `k` changes based on the size of the exponent.
-
-	`redmode` selects the reduction:
-		0:          Montgomery (`_private_montgomery_reduce_comba` when `P` is small enough,
-		            `_private_int_montgomery_reduce` otherwise).
-		1:          Diminished Radix (`_private_int_dr_reduce`).
-		otherwise:  `_private_int_reduce_2k`.
-
-	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
-
-	The window table is allocated with `allocator` and destroyed before returning.
-	The first error reported by any helper is returned unchanged.
-*/
-_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	M := [_TAB_SIZE]Int{}
-	winsize: uint
-
-	/*
-		Use a pointer to the reduction algorithm.
-		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
-	*/
-	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)
-
-	/*
-		On exit, destroy the table entries this procedure uses: M[1] and the upper half of the table.
-		`winsize` is read when the deferred block runs, i.e. after it has been chosen below.
-	*/
-	defer {
-		internal_destroy(&M[1])
-		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-			internal_destroy(&M[x])
-		}
-	}
-
-	/*
-		Find window size from the bit count of the exponent.
-	*/
-	x := internal_count_bits(X)
-	switch {
-	case x <= 7:
-		winsize = 2
-	case x <= 36:
-		winsize = 3
-	case x <= 140:
-		winsize = 4
-	case x <= 450:
-		winsize = 5
-	case x <= 1303:
-		winsize = 6
-	case x <= 3529:
-		winsize = 7
-	case:
-		winsize = 8
-	}
-
-	/*
-		A positive `_MAX_WIN_SIZE` caps the window size.
-	*/
-	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
-
-	/*
-		Init M array
-		Init first cell, sized to the allocated capacity of `P`.
-	*/
-	cap := internal_int_allocated_cap(P)
-	internal_grow(&M[1], cap)                                        or_return
-
-	/*
-		Now init the second half of the array.
-	*/
-	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
-		internal_grow(&M[x], cap)                                    or_return
-	}
-
-	/*
-		Determine and setup reduction code.
-	*/
-	rho: DIGIT
-
-	if redmode == 0 {
-		/*
-			Now setup Montgomery.
-		*/
-		rho = _private_int_montgomery_setup(P)                       or_return
-
-		/*
-			Automatically pick the comba one if available (saves quite a few calls/ifs).
-		*/
-		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
-			redux = _private_montgomery_reduce_comba
-		} else {
-			/*
-				Use slower baseline Montgomery method.
-			*/
-			redux = _private_int_montgomery_reduce
-		}
-	} else if redmode == 1 {
-		/*
-			Setup DR reduction for moduli of the form B**k - b.
-		*/
-		rho = _private_int_dr_setup(P)
-		redux = _private_int_dr_reduce
-	} else {
-		/*
-			Setup DR reduction for moduli of the form 2**k - b.
-		*/
-		rho = _private_int_reduce_2k_setup(P)                        or_return
-		redux = _private_int_reduce_2k
-	}
-
-	/*
-		Setup result.
-	*/
-	internal_grow(res, cap)                                          or_return
-
-	/*
-		Create M table
-		The first half of the table is not computed, though, except for M[1]
-	*/
-
-	if redmode == 0 {
-		/*
-			Now we need R mod m.
-		*/
-		_private_int_montgomery_calc_normalization(res, P)           or_return
-
-		/*
-			Now set M[1] to G * R mod m.
-		*/
-		internal_mulmod(&M[1], G, res, P)                            or_return
-	} else {
-		/*
-			No Montgomery form in these modes: start from 1 and M[1] = G mod P.
-		*/
-		internal_one(res)                                            or_return
-		internal_mod(&M[1], G, P)                                    or_return
-	}
-
-	/*
-		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
-	*/
-	slot := 1 << (winsize - 1)
-	internal_copy(&M[slot], &M[1])                                   or_return
-
-	for x = 0; x < int(winsize - 1); x += 1 {
-		internal_sqr(&M[slot], &M[slot])                             or_return
-		redux(&M[slot], P, rho)                                      or_return
-	}
-
-	/*
-		Create upper table, that is M[x] = M[x-1] * M[1], reduced after every step.
-	*/
-	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
-		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
-		redux(&M[x], P, rho)                                         or_return
-	}
-
-	/*
-		Set initial mode and bit cnt.
-
-		mode 0: still skipping the leading zero bits of the exponent.
-		mode 1: between windows; a zero bit costs one squaring.
-		mode 2: collecting bits into the current window.
-
-		`bitcnt` starts at 1 so the first pass through the loop fetches the top digit of `X`.
-	*/
-	mode   := 0
-	bitcnt := 1
-	buf    := DIGIT(0)
-	digidx := X.used - 1
-	bitcpy := 0
-	bitbuf := DIGIT(0)
-
-	for {
-		/*
-			Grab next digit as required.
-		*/
-		bitcnt -= 1
-		if bitcnt == 0 {
-			/*
-				If digidx == -1 we are out of digits so break.
-			*/
-			if digidx == -1 { break }
-
-			/*
-				Read next digit and reset the bitcnt.
-			*/
-			buf    = X.digit[digidx]
-			digidx -= 1
-			bitcnt = _DIGIT_BITS
-		}
-
-		/*
-			Grab the next msb from the exponent.
-		*/
-		y := (buf >> (_DIGIT_BITS - 1)) & 1
-		buf <<= 1
-
-		/*
-			If the bit is zero and mode == 0 then we ignore it.
-			These represent the leading zero bits before the first 1 bit in the exponent.
-			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
-		*/
-		if mode == 0 && y == 0 { continue }
-
-		/*
-			If the bit is zero and mode == 1 then we square.
-		*/
-		if mode == 1 && y == 0 {
-			internal_sqr(res, res)                                   or_return
-			redux(res, P, rho)                                       or_return
-			continue
-		}
-
-		/*
-			Else we add it to the window, filling `bitbuf` from its most significant window bit downwards.
-		*/
-		bitcpy += 1
-		bitbuf |= (y << (winsize - uint(bitcpy)))
-		mode    = 2
-
-		if bitcpy == int(winsize) {
-			/*
-				Window is filled so square as required and multiply
-				Square first.
-			*/
-			for x = 0; x < int(winsize); x += 1 {
-				internal_sqr(res, res)                               or_return
-				redux(res, P, rho)                                   or_return
-			}
-
-			/*
-				Then multiply by the table entry selected by the window bits.
-			*/
-			internal_mul(res, res, &M[bitbuf])                       or_return
-			redux(res, P, rho)                                       or_return
-
-			/*
-				Empty window and reset.
-			*/
-			bitcpy = 0
-			bitbuf = 0
-			mode   = 1
-		}
-	}
-
-	/*
-		If bits remain in a partially filled window then square/multiply one bit at a time.
-	*/
-	if mode == 2 && bitcpy > 0 {
-		/*
-			Square then multiply if the bit is set.
-		*/
-		for x = 0; x < bitcpy; x += 1 {
-			internal_sqr(res, res)                                   or_return
-			redux(res, P, rho)                                       or_return
-
-			/*
-				Get next bit of the window: shift it up into position `winsize` and test it there.
-			*/
-			bitbuf <<= 1
-			if bitbuf & (1 << winsize) != 0 {
-				/*
-					Then multiply.
-				*/
-				internal_mul(res, res, &M[1])                        or_return
-				redux(res, P, rho)                                   or_return
-			}
-		}
-	}
-
-	if redmode == 0 {
-		/*
-			Fixup result if Montgomery reduction is used.
-			Recall that any value in a Montgomery system is actually multiplied by R mod n.
-			So we have to reduce one more time to cancel out the factor of R.
-		*/
-		redux(res, P, rho)                                           or_return
-	}
-
-	return nil
-}
-
-/*
-	Computes the modular inverse `dest` = 1 / `a` mod `b` via the binary extended
-	Euclidean algorithm, hac 14.61, pp608.
-
-	Returns `.Invalid_Argument` when `b` is negative or zero, when `a` mod `b` and `b` are
-	both even, or when no inverse exists (the remaining gcd is not 1).
-	On success `dest` receives the inverse normalized to 0 <= `dest` < `b`.
-
-	Temporaries are allocated with `allocator` and destroyed before returning.
-	The first error reported by any helper is returned unchanged.
-*/
-_private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	x, y, u, v, A, B, C, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(x, y, u, v, A, B, C, D)
-
-	/*
-		`b` cannot be negative or zero.
-	*/
-	if b.sign == .Negative || internal_is_zero(b) {
-		return .Invalid_Argument
-	}
-
-	/*
-		init temps.
-	*/
-	internal_init_multi(x, y, u, v, A, B, C, D) or_return
-
-	/*
-		`x` = `a` % `b`, `y` = `b`
-	*/
-	internal_mod(x, a, b) or_return
-	internal_copy(y, b) or_return
-
-	/*
-		2. [modified] if x,y are both even then return an error!
-	*/
-	if internal_is_even(x) && internal_is_even(y) {
-		return .Invalid_Argument
-	}
-
-	/*
-		3. u=x, v=y, A=1, B=0, C=0, D=1
-
-		The loop below maintains `A`*`x` + `B`*`y` == `u` and `C`*`x` + `D`*`y` == `v`.
-	*/
-	internal_copy(u, x) or_return
-	internal_copy(v, y) or_return
-	internal_one(A) or_return
-	internal_one(D) or_return
-
-	for {
-		/*
-			4.  while `u` is even do:
-		*/
-		for internal_is_even(u) {
-			/*
-				4.1 `u` = `u` / 2
-			*/
-			internal_int_shr1(u, u) or_return
-
-			/*
-				4.2 if `A` or `B` is odd then:
-			*/
-			if internal_is_odd(A) || internal_is_odd(B) {
-				/*
-					`A` = (`A`+`y`) / 2, `B` = (`B`-`x`) / 2
-
-					Adding `y` to `A` must be paired with subtracting `x` from `B`,
-					otherwise `A`*`x` + `B`*`y` no longer equals `u`.
-				*/
-				internal_add(A, A, y) or_return
-				internal_sub(B, B, x) or_return
-			}
-			/*
-				`A` = `A` / 2, `B` = `B` / 2
-			*/
-			internal_int_shr1(A, A) or_return
-			internal_int_shr1(B, B) or_return
-		}
-
-		/*
-			5.  while `v` is even do:
-		*/
-		for internal_is_even(v) {
-			/*
-				5.1 `v` = `v` / 2
-			*/
-			internal_int_shr1(v, v) or_return
-
-			/*
-				5.2 if `C` or `D` is odd then:
-			*/
-			if internal_is_odd(C) || internal_is_odd(D) {
-				/*
-					`C` = (`C`+`y`) / 2, `D` = (`D`-`x`) / 2
-
-					Same pairing as in step 4.2, keeping `C`*`x` + `D`*`y` equal to `v`.
-				*/
-				internal_add(C, C, y) or_return
-				internal_sub(D, D, x) or_return
-			}
-			/*
-				`C` = `C` / 2, `D` = `D` / 2
-			*/
-			internal_int_shr1(C, C) or_return
-			internal_int_shr1(D, D) or_return
-		}
-
-		/*
-			6.  if `u` >= `v` then:
-		*/
-		if internal_cmp(u, v) != -1 {
-			/*
-				`u` = `u` - `v`, `A` = `A` - `C`, `B` = `B` - `D`
-			*/
-			internal_sub(u, u, v) or_return
-			internal_sub(A, A, C) or_return
-			internal_sub(B, B, D) or_return
-		} else {
-			/*
-				`v` = `v` - `u`, `C` = `C` - `A`, `D` = `D` - `B`
-			*/
-			internal_sub(v, v, u) or_return
-			internal_sub(C, C, A) or_return
-			internal_sub(D, D, B) or_return
-		}
-
-		/*
-			If not zero goto step 4
-		*/
-		if internal_is_zero(u) {
-			break
-		}
-	}
-
-	/*
-		Now `a` = `C`, `b` = `D`, `gcd` == `g`*`v`
-	*/
-
-	/*
-		If `v` != `1` then there is no inverse.
-	*/
-	if !internal_eq(v, 1) {
-		return .Invalid_Argument
-	}
-
-	/*
-		If its too low, add `b` until `C` is no longer negative.
-	*/
-	for internal_is_negative(C) {
-		internal_add(C, C, b) or_return
-	}
-
-	/*
-		Too big: subtract `b` until |`C`| < |`b`|.
-		The comparison has to be against `b`; comparing against zero would always
-		subtract and hand back a negative result.
-	*/
-	for internal_gte_abs(C, b) {
-		internal_sub(C, C, b) or_return
-	}
-
-	/*
-		`C` is now the inverse.
-		After the swap `C` holds the previous contents of `dest`, which the deferred destroy releases.
-	*/
-	swap(dest, C)
-
-	return
-}
-
-/*
-	Computes the modular inverse via binary extended Euclidean algorithm, that is `dest` = 1 / `a` mod `b`.
-
-	Based on slow invmod except this is optimized for the case where `b` is odd,
-	as per HAC Note 14.64 on pp. 610. Only the `B` and `D` coefficients are tracked.
-
-	Returns `.Invalid_Argument` when `b` is even, when `b` or `a` mod `b` is zero,
-	or when no inverse exists (the remaining gcd is not 1).
-	On success `dest` receives the inverse, with its sign set to the sign of `a`.
-
-	Temporaries are allocated with `allocator` and destroyed before returning.
-*/
-_private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-	x, y, u, v, B, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
-	defer internal_destroy(x, y, u, v, B, D)
-
-	sign: Sign
-
-	/*
-		2. [modified] `b` must be odd.
-	*/
-	if internal_is_even(b) { return .Invalid_Argument }
-
-	/*
-		Init all our temps.
-	*/
-	internal_init_multi(x, y, u, v, B, D) or_return
-
-	/*
-		`x` == modulus, `y` == value to invert.
-	*/
-	internal_copy(x, b) or_return
-
-	/*
-		`y` = `a` mod `b`.
-	*/
-	internal_mod(y, a, b) or_return
-
-	/*
-		If one of `x`, `y` is zero return an error!
-	*/
-	if internal_is_zero(x) || internal_is_zero(y) { return .Invalid_Argument }
-
-	/*
-		3. `u` = `x`, `v` = `y`, `B` = 0, `D` = 1
-		(`A` and `C` of the general algorithm are not needed here.)
-	*/
-	internal_copy(u, x) or_return
-	internal_copy(v, y) or_return
-
-	internal_one(D) or_return
-
-	for {
-		/*
-			4.  while `u` is even do.
-		*/
-		for internal_is_even(u) {
-			/*
-				4.1 `u` = `u` / 2
-			*/
-			internal_int_shr1(u, u) or_return
-
-			/*
-				4.2 if `B` is odd then:
-			*/
-			if internal_is_odd(B) {
-				/*
-					`B` = (`B` - `x`) / 2
-				*/
-				internal_sub(B, B, x) or_return
-			}
-
-			/*
-				`B` = `B` / 2
-			*/
-			internal_int_shr1(B, B) or_return
-		}
-
-		/*
-			5.  while `v` is even do:
-		*/
-		for internal_is_even(v) {
-			/*
-				5.1 `v` = `v` / 2
-			*/
-			internal_int_shr1(v, v) or_return
-
-			/*
-				5.2 if `D` is odd then:
-			*/
-			if internal_is_odd(D) {
-				/*
-					`D` = (`D` - `x`) / 2
-				*/
-				internal_sub(D, D, x) or_return
-			}
-			/*
-				`D` = `D` / 2
-			*/
-			internal_int_shr1(D, D) or_return
-		}
-
-		/*
-			6.  if `u` >= `v` then:
-		*/
-		if internal_cmp(u, v) != -1 {
-			/*
-				`u` = `u` - `v`, `B` = `B` - `D`
-			*/
-			internal_sub(u, u, v) or_return
-			internal_sub(B, B, D) or_return
-		} else {
-			/*
-				`v` = `v` - `u`, `D` = `D` - `B`
-			*/
-			internal_sub(v, v, u) or_return
-			internal_sub(D, D, B) or_return
-		}
-
-		/*
-			If not zero goto step 4.
-		*/
-		if internal_is_zero(u) { break }
-	}
-
-	/*
-		Now `a` = C, `b` = D, gcd == g*v
-	*/
-
-	/*
-		if `v` != 1 then there is no inverse
-	*/
-	if internal_cmp(v, 1) != 0 {
-		return .Invalid_Argument
-	}
-
-	/*
-		`D` is now the inverse, up to a multiple of `b`.
-		Remember the sign of `a`, then add `b` until `D` is no longer negative.
-	*/
-	sign = a.sign
-	for internal_int_is_negative(D) {
-		internal_add(D, D, b) or_return
-	}
-
-	/*
-		Too big: subtract `b` until |`D`| < |`b`|.
-	*/
-	for internal_gte_abs(D, b) {
-		internal_sub(D, D, b) or_return
-	}
-
-	/*
-		Hand the result to `dest`; `D` now holds the previous contents of `dest`,
-		which the deferred destroy releases.
-	*/
-	swap(dest, D)
-	dest.sign = sign
-	return nil
-}
-
-
-/*
-	Returns the logarithm of an `Int` with respect to `base`, where `base` is a power of two.
-	Assumes `a` not to be `nil` and to have been initialized.
-
-	The result is (bit count of `a` - 1) divided by the number of trailing zero bits of `base`.
-
-	Returns `.Invalid_Argument` when `base` is zero or odd: zero would never leave the
-	shift loop, and an odd base has no trailing zero bits, which would divide by zero.
-*/
-_private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) {
-	base := base
-
-	/*
-		A zero base never exposes a set low bit, so the loop below would not terminate.
-	*/
-	if base == 0 {
-		return 0, .Invalid_Argument
-	}
-
-	/*
-		y = number of trailing zero bits in `base`, i.e. log2(base) for a power of two.
-	*/
-	y: int
-	for y = 0; base & 1 == 0; {
-		y += 1
-		base >>= 1
-	}
-
-	/*
-		An odd base (including 1) leaves y at zero; refuse it instead of dividing by zero.
-	*/
-	if y == 0 {
-		return 0, .Invalid_Argument
-	}
-
-	log = internal_count_bits(a)
-	return (log - 1) / y, err
-}
-
-/*
-	Copies up to `digits` DIGITs from `src`, starting at index `offset`, to the start of `dest`.
-	Assumes `src` and `dest` to not be `nil` and have been initialized.
-
-	The count is clamped to what `src` holds from `offset` onwards and to what `dest` can hold.
-	Nothing is copied when `dest` and `src` are the same `Int` or when the clamped count is not positive.
-	Returns `.Invalid_Argument` for a negative `offset`.
-*/
-_private_copy_digits :: proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
-	/*
-		If dest == src, do nothing
-	*/
-	if dest == src {
-		return nil
-	}
-
-	/*
-		A negative offset cannot address any digit of `src`.
-	*/
-	if offset < 0 {
-		return .Invalid_Argument
-	}
-
-	/*
-		Only the digits from `offset` to the end of `src` are available to read,
-		so the offset has to be taken off the source length before clamping.
-	*/
-	count := min(digits, len(src.digit) - offset, len(dest.digit))
-
-	/*
-		Nothing to copy. Returning here also avoids indexing an empty backing
-		array and passing a negative size to the copy.
-	*/
-	if count <= 0 {
-		return nil
-	}
-
-	mem.copy_non_overlapping(&dest.digit[0], &src.digit[offset], size_of(DIGIT) * count)
-	return nil
-}
-
-
-/*
-	Shifts `quotient` left in place by `digits` whole legs, i.e. by `digits` * _DIGIT_BITS bits.
-
-	A non-positive `digits` or a zero `quotient` leaves the value untouched.
-	Storage is grown through `allocator` as needed; a failure to grow is returned.
-*/
-_private_int_shl_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	if digits <= 0 {
-		return nil
-	}
-
-	/*
-		Shifting zero still gives zero.
-	*/
-	if #force_inline internal_is_zero(quotient) {
-		return nil
-	}
-
-	/*
-		Make room for the extra legs.
-	*/
-	#force_inline internal_grow(quotient, quotient.used + digits) or_return
-
-	/*
-		Move the used legs up by `digits` positions, starting from the most significant one
-		so that no leg is overwritten before it has been moved.
-	*/
-	count := quotient.used
-	#no_bounds_check for src := count - 1; src >= 0; src -= 1 {
-		quotient.digit[src + digits] = quotient.digit[src]
-	}
-
-	/*
-		Account for the new legs and clear the ones that were vacated at the bottom.
-	*/
-	quotient.used = count + digits
-	mem.zero_slice(quotient.digit[:digits])
-	return nil
-}
-
-/*
-	Shifts `quotient` right in place by `digits` whole legs, i.e. by `digits` * _DIGIT_BITS bits.
-
-	A non-positive `digits` leaves the value untouched.
-	Shifting by more legs than are in use sets `quotient` to zero.
-*/
-_private_int_shr_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
-	context.allocator = allocator
-
-	if digits <= 0 {
-		return nil
-	}
-
-	/*
-		Everything would be shifted out, so the result is simply zero.
-	*/
-	if digits > quotient.used {
-		return internal_zero(quotient)
-	}
-
-	/*
-		Slide the surviving legs down by `digits` positions, least significant first:
-
-		b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-					/\                   |      ---->
-					 \-------------------/      ---->
-	*/
-	remaining := quotient.used - digits
-	#no_bounds_check for dst := 0; dst < remaining; dst += 1 {
-		quotient.digit[dst] = quotient.digit[dst + digits]
-	}
-
-	/*
-		Drop the shifted-out legs, clear what is no longer in use, and normalize.
-	*/
-	quotient.used = remaining
-	internal_zero_unused(quotient)
-	return internal_clamp(quotient)
-}
-
-/*	
-	========================    End of private procedures    =======================
-
-	===============================  Private tables  ===============================
-
-	Tables used by `internal_*` and `_*`.
-*/
-
-/*
-	Lookup table with one entry per residue modulo 128.
-	An entry is 0 where the index is a square modulo 128 (0, 1, 4, 9, 16, 17, 25, 33, ...) and 1 otherwise.
-	NOTE(review): presumably a quick rejection filter for a perfect-square test; confirm against the caller.
-*/
-_private_int_rem_128 := [?]DIGIT{
-	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
-}
-/*
-	Compile-time check that the table has exactly 128 entries.
-*/
-#assert(128 * size_of(DIGIT) == size_of(_private_int_rem_128))
-
-/*
-	Lookup table with one entry per residue modulo 105 (= 3 * 5 * 7), laid out 15 entries per row.
-	An entry is 0 where the index is a square modulo 105 (0, 1, 4, 9, 15, 16, 21, 25, ...) and 1 otherwise.
-	NOTE(review): presumably a quick rejection filter for a perfect-square test; confirm against the caller.
-*/
-_private_int_rem_105 := [?]DIGIT{
-	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
-	0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
-	0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
-	1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-	0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
-	1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
-	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
-}
-/*
-	Compile-time check that the table has exactly 105 entries.
-*/
-#assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
-
-/*
-	The first `_PRIME_TAB_SIZE` (256) primes in ascending order, from 2 up to 0x0653 (1619).
-	The blank lines split the table into four groups of 64 entries.
-*/
-_PRIME_TAB_SIZE :: 256
-_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
-	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-	0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
-	0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-	0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-	0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-	0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-
-	0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-	0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-	0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-	0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-	0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-	0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-	0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-	0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-
-	0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-	0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-	0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-	0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-	0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-	0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-	0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-	0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-
-	0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-	0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-	0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-	0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-	0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-	0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
-}
-/*
-	Compile-time check that the table size matches `_PRIME_TAB_SIZE`.
-*/
-#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
-
-when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
-	_factorial_table := [35]_WORD{
-/* f(00): */                                                     1,
-/* f(01): */                                                     1,
-/* f(02): */                                                     2,
-/* f(03): */                                                     6,
-/* f(04): */                                                    24,
-/* f(05): */                                                   120,
-/* f(06): */                                                   720,
-/* f(07): */                                                 5_040,
-/* f(08): */                                                40_320,
-/* f(09): */                                               362_880,
-/* f(10): */                                             3_628_800,
-/* f(11): */                                            39_916_800,
-/* f(12): */                                           479_001_600,
-/* f(13): */                                         6_227_020_800,
-/* f(14): */                                        87_178_291_200,
-/* f(15): */                                     1_307_674_368_000,
-/* f(16): */                                    20_922_789_888_000,
-/* f(17): */                                   355_687_428_096_000,
-/* f(18): */                                 6_402_373_705_728_000,
-/* f(19): */                               121_645_100_408_832_000,
-/* f(20): */                             2_432_902_008_176_640_000,
-/* f(21): */                            51_090_942_171_709_440_000,
-/* f(22): */                         1_124_000_727_777_607_680_000,
-/* f(23): */                        25_852_016_738_884_976_640_000,
-/* f(24): */                       620_448_401_733_239_439_360_000,
-/* f(25): */                    15_511_210_043_330_985_984_000_000,
-/* f(26): */                   403_291_461_126_605_635_584_000_000,
-/* f(27): */                10_888_869_450_418_352_160_768_000_000,
-/* f(28): */               304_888_344_611_713_860_501_504_000_000,
-/* f(29): */             8_841_761_993_739_701_954_543_616_000_000,
-/* f(30): */           265_252_859_812_191_058_636_308_480_000_000,
-/* f(31): */         8_222_838_654_177_922_817_725_562_880_000_000,
-/* f(32): */       263_130_836_933_693_530_167_218_012_160_000_000,
-/* f(33): */     8_683_317_618_811_886_495_518_194_401_280_000_000,
-/* f(34): */   295_232_799_039_604_140_847_618_609_643_520_000_000,
-	}
-} else {
-	_factorial_table := [21]_WORD{
-/* f(00): */                                                     1,
-/* f(01): */                                                     1,
-/* f(02): */                                                     2,
-/* f(03): */                                                     6,
-/* f(04): */                                                    24,
-/* f(05): */                                                   120,
-/* f(06): */                                                   720,
-/* f(07): */                                                 5_040,
-/* f(08): */                                                40_320,
-/* f(09): */                                               362_880,
-/* f(10): */                                             3_628_800,
-/* f(11): */                                            39_916_800,
-/* f(12): */                                           479_001_600,
-/* f(13): */                                         6_227_020_800,
-/* f(14): */                                        87_178_291_200,
-/* f(15): */                                     1_307_674_368_000,
-/* f(16): */                                    20_922_789_888_000,
-/* f(17): */                                   355_687_428_096_000,
-/* f(18): */                                 6_402_373_705_728_000,
-/* f(19): */                               121_645_100_408_832_000,
-/* f(20): */                             2_432_902_008_176_640_000,
-	}
-}
-
-/*
-	=========================  End of private tables  ========================
+/*
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	An arbitrary precision mathematics implementation in Odin.
+	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
+	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.
+
+	=============================    Private procedures    =============================
+
+	Private procedures used by the above low-level routines follow.
+
+	Don't call these yourself unless you really know what you're doing.
+	They include implementations that are optimal for certain ranges of input only.
+
+	These aren't exported for the same reasons.
+*/
+
+
+package math_big
+
+import "base:intrinsics"
+import "core:mem"
+
+/*
+	Multiplies |a| * |b| and only computes up to `digits` digits of the result.
+	HAC pp. 595, Algorithm 14.12  Modified so you can control how
+	many digits of output are created.
+*/
+_private_int_mul :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	/*
+		Can we use the fast multiplier?
+	*/
+	if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
+		return #force_inline _private_int_mul_comba(dest, a, b, digits)
+	}
+
+	/*
+		Set up temporary output `Int`, which we'll swap for `dest` when done.
+	*/
+
+	t := &Int{}
+
+	internal_grow(t, max(digits, _DEFAULT_DIGIT_COUNT)) or_return
+	t.used = digits
+
+	/*
+		Compute the digits of the product directly.
+	*/
+	pa := a.used
+	for ix := 0; ix < pa; ix += 1 {
+		/*
+			Limit ourselves to `digits` DIGITs of output.
+		*/
+		pb    := min(b.used, digits - ix)
+		carry := _WORD(0)
+		iy    := 0
+
+		/*
+			Compute the column of the output and propagate the carry.
+		*/
+		#no_bounds_check for iy = 0; iy < pb; iy += 1 {
+			/*
+				Compute the column as a _WORD.
+			*/
+			column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry
+
+			/*
+				The new column is the lower part of the result.
+			*/
+			t.digit[ix + iy] = DIGIT(column & _WORD(_MASK))
+
+			/*
+				Get the carry word from the result.
+			*/
+			carry = column >> _DIGIT_BITS
+		}
+		/*
+			Set carry if it is placed below digits
+		*/
+		if ix + iy < digits {
+			t.digit[ix + pb] = DIGIT(carry)
+		}
+	}
+
+	internal_swap(dest, t)
+	internal_destroy(t)
+	return internal_clamp(dest)
+}
+
+
+/*
+	Multiplication using the Toom-Cook 3-way algorithm.
+
+	Much more complicated than Karatsuba but has a lower asymptotic running time of O(N**1.464).
+	This algorithm is only particularly useful on VERY large inputs.
+	(We're talking 1000s of digits here...).
+
+	This file contains code from J. Arndt's book  "Matters Computational"
+	and the accompanying FXT-library with permission of the author.
+
+	Setup from:
+		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
+		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
+
+	The interpolation from above needed one temporary variable more than the interpolation here:
+
+		Bodrato, Marco, and Alberto Zanoni. "What about Toom-Cook matrices optimality."
+		Centro Vito Volterra Universita di Roma Tor Vergata (2006)
+*/
+_private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2)
+
+	/*
+		Init temps.
+	*/
+	internal_init_multi(S1, S2, T1)             or_return
+
+	/*
+		B
+	*/
+	B := min(a.used, b.used) / 3
+
+	/*
+		a = a2 * x^2 + a1 * x + a0;
+	*/
+	internal_grow(a0, B)                        or_return
+	internal_grow(a1, B)                        or_return
+	internal_grow(a2, a.used - 2 * B)           or_return
+
+	a0.used, a1.used = B, B
+	a2.used = a.used - 2 * B
+
+	internal_copy_digits(a0, a, a0.used)        or_return
+	internal_copy_digits(a1, a, a1.used, B)     or_return
+	internal_copy_digits(a2, a, a2.used, 2 * B) or_return
+
+	internal_clamp(a0)
+	internal_clamp(a1)
+	internal_clamp(a2)
+
+	/*
+		b = b2 * x^2 + b1 * x + b0;
+	*/
+	internal_grow(b0, B)                        or_return
+	internal_grow(b1, B)                        or_return
+	internal_grow(b2, b.used - 2 * B)           or_return
+
+	b0.used, b1.used = B, B
+	b2.used = b.used - 2 * B
+
+	internal_copy_digits(b0, b, b0.used)        or_return
+	internal_copy_digits(b1, b, b1.used, B)     or_return
+	internal_copy_digits(b2, b, b2.used, 2 * B) or_return
+
+	internal_clamp(b0)
+	internal_clamp(b1)
+	internal_clamp(b2)
+
+
+	/*
+		\\ S1 = (a2+a1+a0) * (b2+b1+b0);
+	*/
+	internal_add(T1, a2, a1)                    or_return /*   T1 = a2 + a1; */
+	internal_add(S2, T1, a0)                    or_return /*   S2 = T1 + a0; */
+	internal_add(dest, b2, b1)                  or_return /* dest = b2 + b1; */
+	internal_add(S1, dest, b0)                  or_return /*   S1 =  c + b0; */
+	internal_mul(S1, S1, S2)                    or_return /*   S1 = S1 * S2; */
+
+	/*
+		\\S2 = (4*a2+2*a1+a0) * (4*b2+2*b1+b0);
+	*/
+	internal_add(T1, T1, a2)                    or_return /*   T1 = T1 + a2; */
+	internal_int_shl1(T1, T1)                   or_return /*   T1 = T1 << 1; */
+	internal_add(T1, T1, a0)                    or_return /*   T1 = T1 + a0; */
+	internal_add(dest, dest, b2)                or_return /*    c =  c + b2; */
+	internal_int_shl1(dest, dest)               or_return /*    c =  c << 1; */
+	internal_add(dest, dest, b0)                or_return /*    c =  c + b0; */
+	internal_mul(S2, T1, dest)                  or_return /*   S2 = T1 *  c; */
+
+	/*
+		\\S3 = (a2-a1+a0) * (b2-b1+b0);
+	*/
+	internal_sub(a1, a2, a1)                    or_return /*   a1 = a2 - a1; */
+	internal_add(a1, a1, a0)                    or_return /*   a1 = a1 + a0; */
+	internal_sub(b1, b2, b1)                    or_return /*   b1 = b2 - b1; */
+	internal_add(b1, b1, b0)                    or_return /*   b1 = b1 + b0; */
+	internal_mul(a1, a1, b1)                    or_return /*   a1 = a1 * b1; */
+	internal_mul(b1, a2, b2)                    or_return /*   b1 = a2 * b2; */
+
+	/*
+		\\S2 = (S2 - S3) / 3;
+	*/
+	internal_sub(S2, S2, a1)                    or_return /*   S2 = S2 - a1; */
+	_private_int_div_3(S2, S2)                  or_return /*   S2 = S2 / 3; \\ this is an exact division  */
+	internal_sub(a1, S1, a1)                    or_return /*   a1 = S1 - a1; */
+	internal_int_shr1(a1, a1)                   or_return /*   a1 = a1 >> 1; */
+	internal_mul(a0, a0, b0)                    or_return /*   a0 = a0 * b0; */
+	internal_sub(S1, S1, a0)                    or_return /*   S1 = S1 - a0; */
+	internal_sub(S2, S2, S1)                    or_return /*   S2 = S2 - S1; */
+	internal_int_shr1(S2, S2)                   or_return /*   S2 = S2 >> 1; */
+	internal_sub(S1, S1, a1)                    or_return /*   S1 = S1 - a1; */
+	internal_sub(S1, S1, b1)                    or_return /*   S1 = S1 - b1; */
+	internal_int_shl1(T1, b1)                   or_return /*   T1 = b1 << 1; */
+	internal_sub(S2, S2, T1)                    or_return /*   S2 = S2 - T1; */
+	internal_sub(a1, a1, S2)                    or_return /*   a1 = a1 - S2; */
+
+	/*
+		P = b1*x^4+ S2*x^3+ S1*x^2+ a1*x + a0;
+	*/
+	_private_int_shl_leg(b1, 4 * B)             or_return
+	_private_int_shl_leg(S2, 3 * B)             or_return
+	internal_add(b1, b1, S2)                    or_return
+	_private_int_shl_leg(S1, 2 * B)             or_return
+	internal_add(b1, b1, S1)                    or_return
+	_private_int_shl_leg(a1, 1 * B)             or_return
+	internal_add(b1, b1, a1)                    or_return
+	internal_add(dest, b1, a0)                  or_return
+
+	/*
+		a * b - P
+	*/
+	return nil
+}
+
+/*
+	product = |a| * |b| using Karatsuba Multiplication using three half size multiplications.
+
+	Let `B` represent the radix [e.g. 2**_DIGIT_BITS] and let `n` represent
+	half of the number of digits in the min(a,b)
+
+	`a` = `a1` * `B`**`n` + `a0`
+	`b` = `b1` * `B`**`n` + `b0`
+
+	Then, a * b => a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0
+
+	Note that a1b1 and a0b0 are used twice and only need to be computed once.
+	So in total three half size (half # of digit) multiplications are performed,
+		a0b0, a1b1 and (a1+a0)(b1+b0)
+
+	Note that a multiplication of half the digits requires 1/4th the number of
+	single precision multiplications, so in total after one call 25% of the
+	single precision multiplications are saved.
+
+	Note also that the call to `internal_mul` can end up back in this function
+	if the a0, a1, b0, or b1 are above the threshold.
+
+	This is known as divide-and-conquer and leads to the famous O(N**lg(3)) or O(N**1.584)
+	work which is asymptotically lower than the standard O(N**2) that the
+	baseline/comba methods use. Generally though, the overhead of this method doesn't pay off
+	until a certain size is reached, of around 80 used DIGITs.
+*/
+_private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1)
+
+	/*
+		Split point in DIGITs (the `n` of the description above): min # of digits, divided by two.
+	*/
+	B := min(a.used, b.used) >> 1
+
+	/*
+		Init all the temps.
+	*/
+	internal_grow(x0, B)          or_return
+	internal_grow(x1, a.used - B) or_return
+	internal_grow(y0, B)          or_return
+	internal_grow(y1, b.used - B) or_return
+	internal_grow(t1, B * 2)      or_return
+	internal_grow(x0y0, B * 2)    or_return
+	internal_grow(x1y1, B * 2)    or_return
+
+	/*
+		Set the digit counts of the low (x0, y0) and high (x1, y1) halves.
+	*/
+	x0.used, y0.used = B, B
+	x1.used = a.used - B
+	y1.used = b.used - B
+
+	/*
+		We copy the digits directly instead of using higher level functions
+		since we also need to shift the digits. A failed copy aborts the multiplication.
+	*/
+	internal_copy_digits(x0, a, x0.used)    or_return
+	internal_copy_digits(y0, b, y0.used)    or_return
+	internal_copy_digits(x1, a, x1.used, B) or_return
+	internal_copy_digits(y1, b, y1.used, B) or_return
+
+	/*
+		Only need to clamp the lower words since by definition the
+		upper words x1/y1 must have a known number of digits.
+	*/
+	internal_clamp(x0)
+	internal_clamp(y0)
+
+	/*
+		Now calc the products x0y0 and x1y1,
+		after this x0 is no longer required, free temp [x0==t2]!
+	*/
+	internal_mul(x0y0, x0, y0)      or_return /* x0y0 = x0*y0 */
+	internal_mul(x1y1, x1, y1)      or_return /* x1y1 = x1*y1 */
+	internal_add(t1,   x1, x0)      or_return /* now calc x1+x0 and */
+	internal_add(x0,   y1, y0)      or_return /* t2 = y1 + y0 */
+	internal_mul(t1,   t1, x0)      or_return /* t1 = (x1 + x0) * (y1 + y0) */
+
+	/*
+		Subtract x0y0 + x1y1, which leaves the middle term.
+	*/
+	internal_add(x0, x0y0, x1y1)    or_return /* t2 = x0y0 + x1y1 */
+	internal_sub(t1,   t1,   x0)    or_return /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
+
+	/*
+		shift by B.
+	*/
+	_private_int_shl_leg(t1, B)       or_return /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+	_private_int_shl_leg(x1y1, B * 2) or_return /* x1y1 = x1y1 << 2*B */
+
+	internal_add(t1, x0y0, t1)      or_return /* t1 = x0y0 + t1 */
+	internal_add(dest, t1, x1y1)    or_return /* dest = t1 + x1y1 */
+
+	return nil
+}
+
+
+
+/*
+	Fast (comba) multiplier
+
+	This is the fast column-array [comba] multiplier.  It is
+	designed to compute the columns of the product first
+	then handle the carries afterwards.  This has the effect
+	of making the nested loops that compute the columns very
+	simple and schedulable on super-scalar processors.
+
+	This has been modified to produce a variable number of
+	digits of output so if say only a half-product is required
+	you don't have to compute the upper half (a feature
+	required for fast Barrett reduction).
+
+	Based on Algorithm 14.12 on pp.595 of HAC.
+*/
+_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	/*
+		Set up array.
+	*/
+	W: [_WARRAY]DIGIT = ---
+
+	/*
+		Grow the destination as required.
+	*/
+	internal_grow(dest, digits) or_return
+
+	/*
+		Number of output digits to produce.
+	*/
+	pa := min(digits, a.used + b.used)
+
+	/*
+		Clear the carry
+	*/
+	_W := _WORD(0)
+
+	ix: int
+	for ix = 0; ix < pa; ix += 1 {
+		tx, ty, iy, iz: int
+
+		/*
+			Get offsets into the two bignums.
+		*/
+		ty = min(b.used - 1, ix)
+		tx = ix - ty
+
+		/*
+			This is the number of times the loop will iterate, essentially.
+			while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		 
+		iy = min(a.used - tx, ty + 1)
+
+		/*
+			Execute loop.
+		*/
+		#no_bounds_check for iz = 0; iz < iy; iz += 1 {
+			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
+		}
+
+		/*
+			Store term.
+		*/
+		W[ix] = DIGIT(_W) & _MASK
+
+		/*
+			Make next carry.
+		*/
+		_W = _W >> _WORD(_DIGIT_BITS)
+	}
+
+	/*
+		Setup dest.
+	*/
+	old_used := dest.used
+	dest.used = pa
+
+	/*
+		Now extract the previous digit [below the carry].
+	*/
+	copy_slice(dest.digit[0:], W[:pa])	
+
+	/*
+		Clear unused digits [that existed in the old copy of dest].
+	*/
+	internal_zero_unused(dest, old_used)
+
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+
+	return internal_clamp(dest)
+}
+
+/*
+	Multiplies |a| * |b| and does not compute the lower `digits` digits
+	[meant to get the higher part of the product]. `dest` may alias `a` or `b`.
+*/
+_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	/*
+		Can we use the fast multiplier?
+	*/
+	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
+		return _private_int_mul_high_comba(dest, a, b, digits)
+	}
+
+	/* Accumulate into a fresh temporary, swapped for `dest` when done, so stale `dest` digits can't leak in. */
+	t := &Int{}
+	defer internal_destroy(t)
+
+	internal_grow(t, a.used + b.used + 1) or_return
+	t.used = a.used + b.used + 1
+
+	pa := a.used
+	pb := b.used
+	for ix := 0; ix < pa; ix += 1 {
+		carry := DIGIT(0)
+
+		for iy := digits - ix; iy < pb; iy += 1 {
+			/* Calculate the double precision result. */
+			r := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)
+
+			/* Get the lower part. */
+			t.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+
+			/* Carry the carry. */
+			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
+		}
+		t.digit[ix + pb] = carry
+	}
+
+	internal_swap(dest, t)
+	return internal_clamp(dest)
+}
+
+/*
+	This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
+	See the comments for `_private_int_mul_comba` to see how it works.
+
+	This is used in the Barrett reduction since for one of the multiplications
+	only the higher digits were needed.  This essentially halves the work.
+
+	Based on Algorithm 14.12 on pp.595 of HAC.
+*/
+_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	W: [_WARRAY]DIGIT = ---
+	_W: _WORD = 0
+
+	/*
+		Number of output digits to produce. Grow the destination as required.
+	*/
+	pa := a.used + b.used
+	internal_grow(dest, pa) or_return
+
+	ix: int
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Get offsets into the two bignums.
+		*/
+		ty := min(b.used - 1, ix)
+		tx := ix - ty
+
+		/*
+			This is the number of times the loop will iterate, essentially it's
+			while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		iy := min(a.used - tx, ty + 1)
+
+		/*
+			Execute loop.
+		*/
+		for iz := 0; iz < iy; iz += 1 {
+			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
+		}
+
+		/*
+			Store term.
+		*/
+		W[ix] = DIGIT(_W) & DIGIT(_MASK)
+
+		/*
+			Make next carry.
+		*/
+		_W = _W >> _WORD(_DIGIT_BITS)
+	}
+
+	/*
+		Setup dest
+	*/
+	old_used := dest.used
+	dest.used = pa
+
+	for ix = digits; ix < pa; ix += 1 {
+		/*
+			Now extract the previous digit [below the carry].
+		*/
+		dest.digit[ix] = W[ix]
+	}
+
+	/*
+		Zero remainder.
+	*/
+	internal_zero_unused(dest, old_used)
+
+	/*
+		Adjust dest.used based on leading zeroes.
+	*/
+	return internal_clamp(dest)
+}
+
+/*
+	Multiplies operands of unequal size: the larger one is cut into slices as long as the smaller one,
+	each slice is multiplied by the smaller operand, shifted into position and summed.
+	NOTE(review): `min(a.used, b.used)` is used as a divisor, so both operands must have used digits.
+*/
+_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	a, b := a, b
+
+	a0, tmp, r := &Int{}, &Int{}, &Int{}
+	defer internal_destroy(a0, tmp, r)
+
+	b_size   := min(a.used, b.used)
+	n_blocks := max(a.used, b.used) / b_size
+
+	internal_grow(a0, b_size + 2) or_return
+	internal_init_multi(tmp, r)   or_return
+
+	/*
+		Make sure that `a` is the larger one.
+	*/
+	if a.used < b.used {
+		a, b = b, a
+	}
+	assert(a.used >= b.used)
+
+	i, j := 0, 0
+	for ; i < n_blocks; i += 1 {
+		/* Cut a slice of `b_size` digits off of `a`, starting at digit `j`. */
+		a0.used = b_size
+		internal_copy_digits(a0, a, a0.used, j)                      or_return
+		j += a0.used
+		internal_clamp(a0)
+
+		/*
+			Multiply with `b`.
+		*/
+		internal_mul(tmp, a0, b)                                     or_return
+
+		/*
+			Shift `tmp` to the correct position.
+		*/
+		_private_int_shl_leg(tmp, b_size * i)                        or_return
+
+		/*
+			Add to output. No carry needed.
+		*/
+		internal_add(r, r, tmp)                                      or_return
+	}
+
+	/*
+		The left-overs: a final, shorter slice when `a.used` is not a multiple of `b_size`.
+	*/
+	if j < a.used {
+		a0.used = a.used - j
+		internal_copy_digits(a0, a, a0.used, j)                      or_return
+		j += a0.used
+		internal_clamp(a0)
+
+		internal_mul(tmp, a0, b)                                     or_return
+		_private_int_shl_leg(tmp, b_size * i)                        or_return
+		internal_add(r, r, tmp)                                      or_return
+	}
+
+	/* Hand the accumulated result to `dest`; the deferred destroy frees what `dest` held before. */
+	internal_swap(dest, r)
+	return
+}
+
+/*
+	Low level squaring, dest = src * src, HAC pp.596-597, Algorithm 14.16
+	Assumes `dest` and `src` to not be `nil`, and `src` to have been initialized.
+*/
+_private_int_sqr :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	pa := src.used
+
+	t := &Int{}; ix, iy: int
+	/*
+		Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger.
+	*/
+	internal_grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)) or_return
+	t.used = (2 * pa) + 1
+
+	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
+		carry := DIGIT(0)
+		/*
+			First calculate the digit at 2*ix; calculate double precision result.
+		*/
+		r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix]))
+
+		/*
+			Store lower part in result.
+		*/
+		t.digit[ix+ix] = DIGIT(r & _WORD(_MASK))
+		/*
+			Get the carry.
+		*/
+		carry = DIGIT(r >> _DIGIT_BITS)
+
+		#no_bounds_check for iy = ix + 1; iy < pa; iy += 1 {
+			/*
+				First calculate the product.
+			*/
+			r = _WORD(src.digit[ix]) * _WORD(src.digit[iy])
+
+			/* Now calculate the double precision result. Note we use
+			 * addition instead of *2 since it's easier to optimize
+			 */
+			r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry)
+
+			/*
+				Store lower part.
+			*/
+			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
+
+			/*
+				Get carry.
+			*/
+			carry = DIGIT(r >> _DIGIT_BITS)
+		}
+		/*
+			Propagate upwards.
+		*/
+		#no_bounds_check for carry != 0 {
+			r     = _WORD(t.digit[ix+iy]) + _WORD(carry)
+			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
+			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
+			iy += 1
+		}
+	}
+
+	err = internal_clamp(t)
+	internal_swap(dest, t)
+	internal_destroy(t)
+	return err
+}
+
+/*
+	The gist of squaring...
+	You do like mult except the offset of the tmpx [one that starts closer to zero] can't equal the offset of tmpy.
+	So basically you set up iy like before then you min it with (ty-tx) so that it never happens.
+	You double all those you add in the inner loop. After that loop you do the squares and add them in.
+
+	Assumes `dest` and `src` not to be `nil` and `src` to have been initialized.	
+*/
+_private_int_sqr_comba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	W: [_WARRAY]DIGIT = ---
+
+	/*
+		Grow the destination as required.
+	*/
+	pa := uint(src.used) + uint(src.used)
+	internal_grow(dest, int(pa)) or_return
+
+	/*
+		`W1` carries between output columns; `_W` accumulates the current column.
+	*/
+	W1 := _WORD(0)
+	_W  : _WORD = ---
+	ix := uint(0)
+
+	#no_bounds_check for ; ix < pa; ix += 1 {
+		/*
+			Clear counter.
+		*/
+		_W = {}
+
+		/*
+			Get offsets into the two bignums.
+		*/
+		ty := min(uint(src.used) - 1, ix)
+		tx := ix - ty
+
+		/*
+			This is the number of times the loop will iterate,
+			essentially while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		iy := min(uint(src.used) - tx, ty + 1)
+
+		/*
+			Now for squaring, tx can never equal ty.
+			We halve the distance since they approach at a rate of 2x,
+			and we have to round because odd cases need to be executed.
+		*/
+		iy = min(iy, ((ty - tx) + 1) >> 1 )
+
+		/*
+			Execute loop.
+		*/
+		#no_bounds_check for iz := uint(0); iz < iy; iz += 1 {
+			_W += _WORD(src.digit[tx + iz]) * _WORD(src.digit[ty - iz])
+		}
+
+		/*
+			Double the inner product and add carry.
+		*/
+		_W = _W + _W + W1
+
+		/*
+			Even columns have the square term in them.
+		*/
+		if ix & 1 == 0 {
+			_W += _WORD(src.digit[ix >> 1]) * _WORD(src.digit[ix >> 1])
+		}
+
+		/*
+			Store it.
+		*/
+		W[ix] = DIGIT(_W & _WORD(_MASK))
+
+		/*
+			Make next carry.
+		*/
+		W1 = _W >> _DIGIT_BITS
+	}
+
+	/*
+		Setup dest.
+	*/
+	old_used := dest.used
+	dest.used = src.used + src.used
+
+	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
+		dest.digit[ix] = W[ix] & _MASK
+	}
+
+	/*
+		Clear unused digits [that existed in the old copy of dest].
+	*/
+	internal_zero_unused(dest, old_used)
+
+	return internal_clamp(dest)
+}
+
+/*
+	Karatsuba squaring, computes `dest` = `src` * `src` using three half-size squarings.
+ 
+ 	See comments of `_private_int_mul_karatsuba` for details.
+ 	It is essentially the same algorithm but merely tuned to perform recursive squarings.
+*/
+_private_int_sqr_karatsuba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	x0, x1, t1, t2, x0x0, x1x1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(x0, x1, t1, t2, x0x0, x1x1)
+
+	/*
+		Min # of digits, divided by two.
+	*/
+	B := src.used >> 1
+
+	/*
+		Init temps.
+	*/
+	internal_grow(x0,   B) or_return
+	internal_grow(x1,   src.used - B) or_return
+	internal_grow(t1,   src.used * 2) or_return
+	internal_grow(t2,   src.used * 2) or_return
+	internal_grow(x0x0, B * 2       ) or_return
+	internal_grow(x1x1, (src.used - B) * 2) or_return
+
+	/*
+		Now shift the digits.
+	*/
+	x0.used = B
+	x1.used = src.used - B
+
+	#force_inline internal_copy_digits(x0, src, x0.used)
+	#force_inline mem.copy_non_overlapping(&x1.digit[0], &src.digit[B], size_of(DIGIT) * x1.used)
+	#force_inline internal_clamp(x0)
+
+	/*
+		Now calc the products x0*x0 and x1*x1.
+	*/
+	internal_sqr(x0x0, x0) or_return
+	internal_sqr(x1x1, x1) or_return
+
+	/*
+		Now calc (x1+x0)^2
+	*/
+	internal_add(t1, x0, x1) or_return
+	internal_sqr(t1, t1) or_return
+
+	/*
+		Subtract (x0x0 + x1x1), which leaves the cross term 2 * x0 * x1.
+	*/
+	internal_add(t2, x0x0, x1x1) or_return
+	internal_sub(t1, t1, t2) or_return
+
+	/*
+		Shift by B.
+	*/
+	_private_int_shl_leg(t1, B) or_return
+	_private_int_shl_leg(x1x1, B * 2) or_return
+	internal_add(t1, t1, x0x0) or_return
+	internal_add(dest, t1, x1x1) or_return
+
+	return #force_inline internal_clamp(dest)
+}
+
+/*
+	Squaring using Toom-Cook 3-way algorithm.
+
+	Setup and interpolation from algorithm SQR_3 in Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
+	  18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
+*/
+_private_int_sqr_toom :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	S0, a0, a1, a2 := &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(S0, a0, a1, a2)
+
+	/*
+		Init temps.
+	*/
+	internal_zero(S0) or_return
+
+	/*
+		B
+	*/
+	B := src.used / 3
+
+	/*
+		a = a2 * x^2 + a1 * x + a0;
+	*/
+	internal_grow(a0, B) or_return
+	internal_grow(a1, B) or_return
+	internal_grow(a2, src.used - (2 * B)) or_return
+
+	a0.used = B
+	a1.used = B
+	a2.used = src.used - 2 * B
+
+	#force_inline mem.copy_non_overlapping(&a0.digit[0], &src.digit[    0], size_of(DIGIT) * a0.used)
+	#force_inline mem.copy_non_overlapping(&a1.digit[0], &src.digit[    B], size_of(DIGIT) * a1.used)
+	#force_inline mem.copy_non_overlapping(&a2.digit[0], &src.digit[2 * B], size_of(DIGIT) * a2.used)
+
+	internal_clamp(a0)
+	internal_clamp(a1)
+	internal_clamp(a2)
+
+	/** S0 = a0^2;  */
+	internal_sqr(S0, a0) or_return
+
+	/** \\S1 = (a2 + a1 + a0)^2 */
+	/** \\S2 = (a2 - a1 + a0)^2  */
+	/** \\S1 = a0 + a2; */
+	/** a0 = a0 + a2; */
+	internal_add(a0, a0, a2) or_return
+	/** \\S2 = S1 - a1; */
+	/** b = a0 - a1; */
+	internal_sub(dest, a0, a1) or_return
+	/** \\S1 = S1 + a1; */
+	/** a0 = a0 + a1; */
+	internal_add(a0, a0, a1) or_return
+	/** \\S1 = S1^2;  */
+	/** a0 = a0^2; */
+	internal_sqr(a0, a0) or_return
+	/** \\S2 = S2^2;  */
+	/** b = b^2; */
+	internal_sqr(dest, dest) or_return
+	/** \\ S3 = 2 * a1 * a2  */
+	/** \\S3 = a1 * a2;  */
+	/** a1 = a1 * a2; */
+	internal_mul(a1, a1, a2) or_return
+	/** \\S3 = S3 << 1;  */
+	/** a1 = a1 << 1; */
+	internal_shl(a1, a1, 1) or_return
+	/** \\S4 = a2^2;  */
+	/** a2 = a2^2; */
+	internal_sqr(a2, a2) or_return
+	/** \\ tmp = (S1 + S2)/2  */
+	/** \\tmp = S1 + S2; */
+	/** b = a0 + b; */
+	internal_add(dest, a0, dest) or_return
+	/** \\tmp = tmp >> 1; */
+	/** b = b >> 1; */
+	internal_shr(dest, dest, 1) or_return
+	/** \\ S1 = S1 - tmp - S3  */
+	/** \\S1 = S1 - tmp; */
+	/** a0 = a0 - b; */
+	internal_sub(a0, a0, dest) or_return
+	/** \\S1 = S1 - S3;  */
+	/** a0 = a0 - a1; */
+	internal_sub(a0, a0, a1) or_return
+	/** \\S2 = tmp - S4 -S0  */
+	/** \\S2 = tmp - S4;  */
+	/** b = b - a2; */
+	internal_sub(dest, dest, a2) or_return
+	/** \\S2 = S2 - S0;  */
+	/** b = b - S0; */
+	internal_sub(dest, dest, S0) or_return
+	/** \\P = S4*x^4 + S3*x^3 + S2*x^2 + S1*x + S0; */
+	/** P = a2*x^4 + a1*x^3 + b*x^2 + a0*x + S0; */
+	_private_int_shl_leg(  a2, 4 * B) or_return
+	_private_int_shl_leg(  a1, 3 * B) or_return
+	_private_int_shl_leg(dest, 2 * B) or_return
+	_private_int_shl_leg(  a0, 1 * B) or_return
+
+	internal_add(a2, a2, a1) or_return
+	internal_add(dest, dest, a2) or_return
+	internal_add(dest, dest, a0) or_return
+	internal_add(dest, dest, S0) or_return
+	/** a^2 - P  */
+
+	return #force_inline internal_clamp(dest)
+}
+
+/*
+	Divide by three (based on routine from MPI and the GMP manual).
+*/
+_private_int_div_3 :: proc(quotient, numerator: ^Int, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
+	context.allocator = allocator
+
+	/*
+		b = 2^_DIGIT_BITS / 3
+	*/
+	b := (_WORD(1) << _WORD(_DIGIT_BITS)) / _WORD(3)
+
+	q := &Int{}
+	internal_grow(q, numerator.used) or_return
+	q.used = numerator.used
+	q.sign = numerator.sign
+
+	w, t: _WORD
+	#no_bounds_check for ix := numerator.used - 1; ix >= 0; ix -= 1 {
+		w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix])
+		if w >= 3 {
+			/*
+				Multiply w by [1/3].
+			*/
+			t = (w * b) >> _WORD(_DIGIT_BITS)
+
+			/*
+				Now subtract 3 * [w/3] from w, to get the remainder.
+			*/
+			w -= t+t+t
+
+			/*
+				Fixup the remainder as required since the optimization is not exact.
+			*/
+			for w >= 3 {
+				t += 1
+				w -= 3
+			}
+		} else {
+			t = 0
+		}
+		q.digit[ix] = DIGIT(t)
+	}
+	remainder = DIGIT(w)
+
+	/*
+		[optional] store the quotient; `q` then holds the old `quotient` storage and is freed below.
+	*/
+	if quotient != nil {
+		err = internal_clamp(q)
+		internal_swap(q, quotient)
+	}
+	internal_destroy(q)
+	return remainder, err
+}
+
+/*
+	Signed Integer Division
+
+	c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp.598 Algorithm 14.20
+
+	Note that the description in HAC is horribly incomplete.
+	For example, it doesn't consider the case where digits are removed from 'x' in
+	the inner loop.
+
+	It also doesn't consider the case that y has fewer than three digits, etc.
+	The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+
+	`quotient` and `remainder` may each be `nil` when that result is not wanted.
+	The quotient is negative when the operand signs differ (and it is non-zero);
+	a non-zero remainder takes the sign of `numerator`.
+*/
+_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	error_if_immutable(quotient, remainder) or_return
+
+	q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(q, x, y, t1, t2)
+
+	internal_grow(q, numerator.used + 2) or_return
+	q.used = numerator.used + 2
+
+	internal_init_multi(t1, t2) or_return
+	internal_copy(x, numerator) or_return
+	internal_copy(y, denominator) or_return
+
+	/*
+		Fix the sign.
+	*/
+	neg   := numerator.sign != denominator.sign
+	x.sign = .Zero_or_Positive
+	y.sign = .Zero_or_Positive
+
+	/*
+		Normalize both x and y, ensure that y >= b/2, [b == 2**MP_DIGIT_BIT]
+	*/
+	norm := internal_count_bits(y) % _DIGIT_BITS
+
+	if norm < _DIGIT_BITS - 1 {
+		norm = (_DIGIT_BITS - 1) - norm
+		internal_shl(x, x, norm) or_return
+		internal_shl(y, y, norm) or_return
+	} else {
+		norm = 0
+	}
+
+	/*
+		Note: HAC does 0 based, so if used==5 then it's 0,1,2,3,4, i.e. use 4
+	*/
+	n := x.used - 1
+	t := y.used - 1
+
+	/*
+		while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} }
+		y = y*b**{n-t}
+	*/
+
+	_private_int_shl_leg(y, n - t) or_return
+
+	gte := internal_gte(x, y)
+	for gte {
+		q.digit[n - t] += 1
+		internal_sub(x, x, y) or_return
+		gte = internal_gte(x, y)
+	}
+
+	/*
+		Reset y by shifting it back down.
+	*/
+	_private_int_shr_leg(y, n - t)
+
+	/*
+		Step 3. for i from n down to (t + 1).
+	*/
+	#no_bounds_check for i := n; i >= (t + 1); i -= 1 {
+		if i > x.used { continue }
+
+		/*
+			step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt
+		*/
+		if x.digit[i] == y.digit[t] {
+			/*
+				b - 1 is the largest digit value, i.e. `_MASK`.
+				The estimate must never be too small, because the loop below only ever decreases it.
+			*/
+			q.digit[(i - t) - 1] = _MASK
+		} else {
+
+			tmp := _WORD(x.digit[i]) << _DIGIT_BITS
+			tmp |= _WORD(x.digit[i - 1])
+			tmp /= _WORD(y.digit[t])
+			if tmp > _WORD(_MASK) {
+				tmp = _WORD(_MASK)
+			}
+			q.digit[(i - t) - 1] = DIGIT(tmp & _WORD(_MASK))
+		}
+
+		/* while (q{i-t-1} * (yt * b + y{t-1})) >
+					xi * b**2 + xi-1 * b + xi-2
+
+			do q{i-t-1} -= 1;
+		*/
+
+		iter := 0
+
+		/*
+			Pre-increment so that the first pass through the loop tests the unmodified estimate.
+		*/
+		q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] + 1) & _MASK
+		#no_bounds_check for {
+			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK
+
+			/*
+				Find left hand.
+			*/
+			internal_zero(t1) or_return
+			t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]
+			t1.digit[1] = y.digit[t]
+			t1.used = 2
+			internal_mul(t1, t1, q.digit[(i - t) - 1]) or_return
+
+			/*
+				Find right hand.
+			*/
+			t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]
+			t2.digit[1] = x.digit[i - 1] /* i >= 1 always holds */
+			t2.digit[2] = x.digit[i]
+			t2.used = 3
+
+			if internal_lte(t1, t2) {
+				break
+			}
+			/*
+				Safety net against a runaway correction loop.
+			*/
+			iter += 1; if iter > 100 {
+				return .Max_Iterations_Reached
+			}
+		}
+
+		/*
+			Step 3.3 x = x - q{i-t-1} * y * b**{i-t-1}
+		*/
+		int_mul_digit(t1, y, q.digit[(i - t) - 1]) or_return
+		_private_int_shl_leg(t1, (i - t) - 1) or_return
+		internal_sub(x, x, t1) or_return
+
+		/*
+			if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
+		*/
+		if x.sign == .Negative {
+			internal_copy(t1, y) or_return
+			_private_int_shl_leg(t1, (i - t) - 1) or_return
+			internal_add(x, x, t1) or_return
+
+			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK
+		}
+	}
+
+	/*
+		Now q is the quotient and x is the remainder, [which we have to normalize]
+		Get sign before writing to c.
+	*/
+	z, _ := is_zero(x)
+	x.sign = .Zero_or_Positive if z else numerator.sign
+
+	if quotient != nil {
+		internal_clamp(q) or_return
+		internal_swap(q, quotient)
+		/*
+			Never hand back a negative zero.
+		*/
+		quotient.sign = .Negative if neg && quotient.used > 0 else .Zero_or_Positive
+	}
+
+	if remainder != nil {
+		/*
+			Undo the normalization shift applied to x above.
+		*/
+		internal_shr(x, x, norm) or_return
+		internal_swap(x, remainder)
+	}
+
+	return nil
+}
+
+/*
+	Direct implementation of algorithms 1.8 "RecursiveDivRem" and 1.9 "UnbalancedDivision" from:
+
+		Brent, Richard P., and Paul Zimmermann. "Modern computer arithmetic"
+		Vol. 18. Cambridge University Press, 2010
+		Available online at https://arxiv.org/pdf/1004.4710
+
+	pages 19ff. in the above online document.
+
+	This is the recursive step ("RecursiveDivRem"). Below the cutoff it hands the
+	operands to `_private_int_div_school`.
+*/
+_private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	A1, A2, B1, B0, Q1, Q0, R1, R0, t := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(A1, A2, B1, B0, Q1, Q0, R1, R0, t)
+
+	m := a.used - b.used
+	k := m / 2
+
+	if m < MUL_KARATSUBA_CUTOFF {
+		return _private_int_div_school(quotient, remainder, a, b)
+	}
+
+	internal_init_multi(A1, A2, B1, B0, Q1, Q0, R1, R0, t) or_return
+
+	/*
+		`B1` = `b` / `beta`^`k`, `B0` = `b` % `beta`^`k`
+	*/
+	internal_shrmod(B1, B0, b, k * _DIGIT_BITS) or_return
+
+	/*
+		(Q1, R1) =  RecursiveDivRem(A / beta^(2k), B1)
+	*/
+	internal_shrmod(A1, t, a, 2 * k * _DIGIT_BITS) or_return
+	_private_div_recursion(Q1, R1, A1, B1) or_return
+
+	/*
+		A1 = (R1 * beta^(2k)) + (A % beta^(2k)) - (Q1 * B0 * beta^k)
+
+		The product has to be scaled by beta^k and actually subtracted,
+		otherwise the correction loop below never sees a negative `A1`.
+	*/
+	_private_int_shl_leg(R1, 2 * k) or_return
+	internal_add(A1, R1, t) or_return
+	internal_mul(t, Q1, B0) or_return
+	_private_int_shl_leg(t, k) or_return
+	internal_sub(A1, A1, t) or_return
+
+	/*
+		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
+	*/
+	if internal_lt(A1, 0) {
+		internal_shl(t, b, k * _DIGIT_BITS) or_return
+
+		for {
+			internal_decr(Q1) or_return
+			internal_add(A1, A1, t) or_return
+			if internal_gte(A1, 0) { break }
+		}
+	}
+
+	/*
+		(Q0, R0) =  RecursiveDivRem(A1 / beta^(k), B1)
+	*/
+	internal_shrmod(A1, t, A1, k * _DIGIT_BITS) or_return
+	_private_div_recursion(Q0, R0, A1, B1) or_return
+
+	/*
+		A2 = (R0*beta^k) +  (A1 % beta^k) - (Q0*B0)
+	*/
+	_private_int_shl_leg(R0, k) or_return
+	internal_add(A2, R0, t) or_return
+	internal_mul(t, Q0, B0) or_return
+	internal_sub(A2, A2, t) or_return
+
+	/*
+		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
+	*/
+	for internal_is_negative(A2) { // internal_lt(A2, 0) {
+		internal_decr(Q0) or_return
+		internal_add(A2, A2, b) or_return
+	}
+
+	/*
+		Return q = (Q1*beta^k) + Q0, r = A2.
+	*/
+	_private_int_shl_leg(Q1, k) or_return
+	internal_add(quotient, Q1, Q0) or_return
+
+	return internal_copy(remainder, A2)
+}
+
+/*
+	Recursive division driver ("UnbalancedDivision").
+
+	Normalizes `a` and `b`, feeds `a` to `_private_div_recursion` in chunks while it is
+	more than twice as long as `b`, then assembles the quotient and de-normalizes the remainder.
+	`quotient` and `remainder` may each be `nil` when that result is not wanted.
+*/
+_private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	A, B, Q, Q1, R, A_div, A_mod := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(A, B, Q, Q1, R, A_div, A_mod)
+
+	internal_init_multi(A, B, Q, Q1, R, A_div, A_mod) or_return
+
+	/*
+		Most significant bit of a limb.
+		Relies on `_DIGIT_MAX + 1` being representable in a `DIGIT`.
+	*/
+	msb := (_DIGIT_MAX + DIGIT(1)) >> 1
+	sigma := 0
+	msb_b := b.digit[b.used - 1]
+	for msb_b < msb {
+		sigma += 1
+		msb_b <<= 1
+	}
+
+	/*
+		Use that sigma to normalize B.
+	*/
+	internal_shl(B, b, sigma) or_return
+	internal_shl(A, a, sigma) or_return
+
+	/*
+		Fix the sign.
+	*/
+	neg := a.sign != b.sign
+	A.sign = .Zero_or_Positive; B.sign = .Zero_or_Positive
+
+	/*
+		If the magnitude of "A" is not more than twice that of "B" we can work
+		on them directly, otherwise we need to work at "A" in chunks.
+	*/
+	n := B.used
+	m := A.used - B.used
+
+	/*
+		Q = 0. We already ensured that when we called `internal_init_multi`.
+	*/
+	for m > n {
+		/*
+			(q, r) = RecursiveDivRem(A / (beta^(m-n)), B)
+		*/
+		j := (m - n) * _DIGIT_BITS
+		internal_shrmod(A_div, A_mod, A, j) or_return
+		_private_div_recursion(Q1, R, A_div, B) or_return
+
+		/*
+			Q = (Q*beta^(n)) + q
+		*/
+		internal_shl(Q, Q, n * _DIGIT_BITS) or_return
+		internal_add(Q, Q, Q1) or_return
+
+		/*
+			A = (r * beta^(m-n)) + (A % beta^(m-n))
+		*/
+		internal_shl(R, R, (m - n) * _DIGIT_BITS) or_return
+		internal_add(A, R, A_mod) or_return
+
+		/*
+			m = m - n
+		*/
+		m -= n
+	}
+
+	/*
+		(q, r) = RecursiveDivRem(A, B)
+	*/
+	_private_div_recursion(Q1, R, A, B) or_return
+
+	/*
+		Q = (Q * beta^m) + q, R = r
+	*/
+	internal_shl(Q, Q, m * _DIGIT_BITS) or_return
+	internal_add(Q, Q, Q1) or_return
+
+	/*
+		Get sign before writing to dest.
+		A non-zero remainder takes the sign of the dividend; a zero remainder stays positive.
+		The test has to look at `R` itself, not at the quotient.
+	*/
+	R.sign = .Zero_or_Positive if internal_is_zero(R) else a.sign
+
+	if quotient != nil {
+		internal_swap(quotient, Q)
+		/*
+			Never hand back a negative zero.
+		*/
+		quotient.sign = .Negative if neg && quotient.used > 0 else .Zero_or_Positive
+	}
+	if remainder != nil {
+		/*
+			De-normalize the remainder.
+		*/
+		internal_shrmod(R, nil, R, sigma) or_return
+		internal_swap(remainder, R)
+	}
+	return nil
+}
+
+/*
+	Slower bit-bang division... also smaller.
+
+	Shift-and-subtract long division. `quotient` and `remainder` may each be `nil`.
+	The quotient is negative when the operand signs differ and it is non-zero;
+	a non-zero remainder takes the sign of `numerator`.
+*/
+@(deprecated="Use `_int_div_school`, it's 3.5x faster.")
+_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
+
+	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}
+
+	defer internal_destroy(ta, tb, tq, q)
+
+	internal_one(tq) or_return
+
+	num_bits, _ := count_bits(numerator)
+	den_bits, _ := count_bits(denominator)
+	n := num_bits - den_bits
+
+	/*
+		Capture the signs up front: `quotient` or `remainder` may alias an input,
+		in which case the swaps below would overwrite them.
+	*/
+	numerator_sign := numerator.sign
+	neg            := numerator.sign != denominator.sign
+
+	abs(ta, numerator)   or_return
+	abs(tb, denominator) or_return
+	shl(tb, tb, n)       or_return
+	shl(tq, tq, n)       or_return
+
+	for n >= 0 {
+		if internal_gte(ta, tb) {
+			// ta -= tb
+			sub(ta, ta, tb) or_return
+			//  q += tq
+			add( q, q,  tq) or_return
+		}
+		shr1(tb, tb) or_return
+		shr1(tq, tq) or_return
+
+		n -= 1
+	}
+
+	/*
+		Now q == quotient and ta == remainder.
+	*/
+	if quotient != nil {
+		swap(quotient, q)
+		z, _ := is_zero(quotient)
+		quotient.sign = .Negative if neg && !z else .Zero_or_Positive
+	}
+	if remainder != nil {
+		swap(remainder, ta)
+		/*
+			The zero test must look at the remainder itself, so a zero remainder is never negative.
+		*/
+		z, _ := is_zero(remainder)
+		remainder.sign = .Zero_or_Positive if z else numerator_sign
+	}
+
+	return nil
+}
+
+
+
+/*
+	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html
+
+	Accumulates the products of odd numbers in `outer`, then shifts the result left by
+	`n - popcount(n)` bits and stores it in `res`.
+*/
+_private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	inner, outer, temp := &Int{}, &Int{}, &Int{}
+	defer internal_destroy(inner, outer, temp)
+
+	internal_one(inner, false)                                       or_return
+	internal_one(outer, false)                                       or_return
+
+	bits_used := ilog2(n)
+
+	for i := bits_used; i >= 0; i -= 1 {
+		/*
+			Odd bounds for this level's range of factors; `| 1` forces them to be odd.
+		*/
+		start := (n >> (uint(i) + 1)) + 1 | 1
+		stop  := (n >> uint(i)) + 1 | 1
+		_private_int_recursive_product(temp, start, stop, 0)         or_return
+		internal_mul(inner, inner, temp)                             or_return
+		internal_mul(outer, outer, inner)                            or_return
+	}
+	shift := n - intrinsics.count_ones(n)
+
+	return internal_shl(res, outer, int(shift))
+}
+
+/*
+	Recursive product used by binary split factorial algorithm.
+
+	The number of factors is `(stop - start) >> 1`; the range is split in half
+	until at most two factors remain.
+	Returns `.Max_Iterations_Reached` once `level` exceeds `FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS`.
+*/
+_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	lhs, rhs := &Int{}, &Int{}
+	defer internal_destroy(lhs, rhs)
+
+	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS {
+		return .Max_Iterations_Reached
+	}
+
+	factor_count := (stop - start) >> 1
+
+	switch {
+	case factor_count == 2:
+		/*
+			Exactly two factors: `start` and `start + 2`.
+		*/
+		internal_set(lhs, start, false)                              or_return
+		internal_grow(rhs, lhs.used + 1, false)                      or_return
+		internal_add(rhs, lhs, 2)                                    or_return
+		return internal_mul(res, lhs, rhs)
+
+	case factor_count > 1:
+		/*
+			Split at an odd midpoint and multiply the two halves.
+		*/
+		split := (start + factor_count) | 1
+		_private_int_recursive_product(lhs, start, split, level + 1) or_return
+		_private_int_recursive_product(rhs, split,  stop, level + 1) or_return
+		return internal_mul(res, lhs, rhs)
+
+	case factor_count == 1:
+		return #force_inline internal_set(res, start, true)
+	}
+
+	/*
+		No factors left: the empty product is one.
+	*/
+	return #force_inline internal_one(res, true)
+}
+
+/*
+	Internal function computing GCD using the binary method,
+		and, if `res_lcm` isn't `nil`, also LCM.
+
+	Expects the `a` and `b` to have been initialized
+		and one or both of `res_gcd` or `res_lcm` not to be `nil`.
+
+	If both `a` and `b` are zero, GCD and LCM are both zero.
+	If exactly one of `a` or `b` is zero, GCD is the absolute value of the other one and LCM is zero.
+
+	The `gcd` and `lcm` wrappers have already done this test,
+	but `gcd_lcm` wouldn't have, so we still need to perform it.
+
+	If neither result is wanted, we have nothing to do.
+*/
+_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	if res_gcd == nil && res_lcm == nil {
+		return nil
+	}
+
+	/*
+		Early outs for the cases where one or both inputs are zero.
+	*/
+	if a.used == 0 && b.used == 0 {
+		/*
+			GCD(0, 0) and LCM(0, 0) are both 0.
+		*/
+		if res_gcd != nil {
+			internal_zero(res_gcd) or_return
+		}
+		if res_lcm != nil {
+			internal_zero(res_lcm) or_return
+		}
+		return nil
+	} else if a.used == 0 {
+		/*
+			We can early out with GCD = |B| and LCM = 0
+		*/
+		if res_gcd != nil {
+			internal_abs(res_gcd, b) or_return
+		}
+		if res_lcm != nil {
+			internal_zero(res_lcm) or_return
+		}
+		return nil
+	} else if b.used == 0 {
+		/*
+			We can early out with GCD = |A| and LCM = 0
+		*/
+		if res_gcd != nil {
+			internal_abs(res_gcd, a) or_return
+		}
+		if res_lcm != nil {
+			internal_zero(res_lcm) or_return
+		}
+		return nil
+	}
+
+	/*
+		We need a temporary for the GCD because `res_gcd` is allowed to be `nil`
+		while the LCM computation below still needs the value.
+	*/
+	temp_gcd_res := &Int{}
+	defer internal_destroy(temp_gcd_res)
+
+	/*
+		Neither `a` nor `b` is zero, so we need to compute `gcd`.
+		Get copies of `a` and `b` we can modify.
+	*/
+	u, v := &Int{}, &Int{}
+	defer internal_destroy(u, v)
+	internal_copy(u, a) or_return
+	internal_copy(v, b) or_return
+
+	/*
+		Must be positive for the remainder of the algorithm.
+	*/
+	u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive
+
+	/*
+		B1.  Find the common power of two for `u` and `v`.
+	*/
+ 	u_lsb, _ := internal_count_lsb(u)
+ 	v_lsb, _ := internal_count_lsb(v)
+ 	k        := min(u_lsb, v_lsb)
+
+	if k > 0 {
+		/*
+			Divide the power of two out.
+		*/
+		internal_shr(u, u, k) or_return
+		internal_shr(v, v, k) or_return
+	}
+
+	/*
+		Divide any remaining factors of two out.
+	*/
+	if u_lsb != k {
+		internal_shr(u, u, u_lsb - k) or_return
+	}
+	if v_lsb != k {
+		internal_shr(v, v, v_lsb - k) or_return
+	}
+
+	for v.used != 0 {
+		/*
+			Make sure `v` is the largest.
+		*/
+		if internal_gt(u, v) {
+			/*
+				Swap `u` and `v` to make sure `v` is >= `u`.
+			*/
+			internal_swap(u, v)
+		}
+
+		/*
+			Subtract smallest from largest.
+		*/
+		internal_sub(v, v, u) or_return
+
+		/*
+			Divide out all factors of two.
+			Note: this local `b` is a bit count and shadows the `b` parameter inside the loop body.
+		*/
+		b, _ := internal_count_lsb(v)
+		internal_shr(v, v, b) or_return
+	}
+
+	/*
+		Multiply by 2**k which we divided out at the beginning.
+	*/
+ 	internal_shl(temp_gcd_res, u, k) or_return
+ 	temp_gcd_res.sign = .Zero_or_Positive
+
+	/*
+		We've computed `gcd`.
+		If we don't want `lcm`, we're done.
+	*/
+	if res_lcm == nil {
+		internal_swap(temp_gcd_res, res_gcd)
+		return nil
+	}
+
+	/*
+		Computes least common multiple as `|a*b|/gcd(a,b)`
+		Divide the smallest by the GCD.
+	*/
+	if internal_lt_abs(a, b) {
+		/*
+			Store the quotient `a / gcd` in `res_lcm`, then multiply by `b` to get the LCM.
+		*/
+		internal_div(res_lcm, a, temp_gcd_res) or_return
+		err = internal_mul(res_lcm, res_lcm, b)
+	} else {
+		/*
+			Store the quotient `b / gcd` in `res_lcm`, then multiply by `a` to get the LCM.
+		*/
+		internal_div(res_lcm, b, temp_gcd_res) or_return
+		err = internal_mul(res_lcm, res_lcm, a)
+	}
+
+	if res_gcd != nil {
+		internal_swap(temp_gcd_res, res_gcd)
+	}
+
+	/*
+		Fix the sign to positive and return (along with any error from the multiplication above).
+	*/
+	res_lcm.sign = .Zero_or_Positive
+	return err
+}
+
+/*
+	Internal implementation of log.
+	Assumes `a` not to be `nil` and to have been initialized.
+
+	Returns the integer logarithm of `a` to `base`: 0 when `a < base`, 1 when `a == base`.
+	On error `res` is set to -1.
+*/
+_private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -> (res: int, err: Error) {
+	/*
+		Route every allocation, including the deferred destroy and the helpers below
+		that don't take an explicit allocator, through the allocator we were given.
+	*/
+	context.allocator = allocator
+
+	bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(bracket_low, bracket_high, bracket_mid, t, bi_base)
+
+	ic := #force_inline internal_cmp(a, base)
+	if ic == -1 || ic == 0 {
+		return 1 if ic == 0 else 0, nil
+	}
+	defer if err != nil {
+		res = -1
+	}
+
+	internal_set(bi_base, base, true, allocator)       or_return
+	internal_clear(bracket_mid, false, allocator)      or_return
+	internal_clear(t, false, allocator)                or_return
+	internal_one(bracket_low, false, allocator)        or_return
+	internal_set(bracket_high, base, false, allocator) or_return
+
+	low := 0; high := 1
+
+	/*
+		A kind of Giant-step/baby-step algorithm.
+		Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/
+		The effect is asymptotic, hence needs benchmarks to test if the Giant-step should be skipped
+		for small n.
+	*/
+
+	for {
+		/*
+			Iterate until `a` is bracketed between low + high.
+			Each step squares `bracket_high`, doubling the exponent `high`.
+		*/
+		if #force_inline internal_gte(bracket_high, a) { break }
+
+		low = high
+		#force_inline internal_copy(bracket_low, bracket_high) or_return
+		high <<= 1
+		#force_inline internal_sqr(bracket_high, bracket_high) or_return
+	}
+
+	/*
+		Bisect the exponent range until the bracket is one wide.
+	*/
+	for (high - low) > 1 {
+		mid := (high + low) >> 1
+
+		#force_inline internal_pow(t, bi_base, mid - low) or_return
+
+		#force_inline internal_mul(bracket_mid, bracket_low, t) or_return
+
+		mc := #force_inline internal_cmp(a, bracket_mid)
+		switch mc {
+		case -1:
+			high = mid
+			internal_swap(bracket_mid, bracket_high)
+		case  0:
+			return mid, nil
+		case  1:
+			low = mid
+			internal_swap(bracket_mid, bracket_low)
+		}
+	}
+
+	fc := #force_inline internal_cmp(bracket_high, a)
+	res = high if fc == 0 else low
+
+	return
+}
+
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	This is an optimized implementation of `internal_montgomery_reduce`
+	which uses the comba method to quickly calculate the columns of the reduction.
+	Based on Algorithm 14.32 on pp.601 of HAC.
+
+	`x` is reduced in place. Returns `.Invalid_Argument` if `x` has more than `_WARRAY` digits.
+	NOTE(review): `W` is indexed up to `2 * n.used`; the caller is presumably responsible
+	for ensuring that fits in `_WARRAY` - confirm against callers.
+*/
+_private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	W: [_WARRAY]_WORD = ---
+
+	if x.used > _WARRAY { return .Invalid_Argument }
+
+	/*
+		Get old used count.
+	*/
+	old_used := x.used
+
+	/*
+		Grow `x` as required.
+	*/
+	internal_grow(x, n.used + 1)                                     or_return
+
+	/*
+		First we have to get the digits of the input into an array of double precision words W[...]
+		Copy the digits of `x` into W[0..`x.used` - 1]
+	*/
+	ix: int
+	for ix = 0; ix < x.used; ix += 1 {
+		W[ix] = _WORD(x.digit[ix])
+	}
+
+	/*
+		Zero the high words W[x.used .. n.used*2].
+		`W` was declared uninitialized (`---`), so this is required.
+	*/
+	zero_upper := (n.used * 2) + 1
+	if ix < zero_upper {
+		for ix = x.used; ix < zero_upper; ix += 1 {
+			W[ix] = {}
+		}
+	}
+
+	/*
+		Now we proceed to zero successive digits from the least significant upwards.
+	*/
+	for ix = 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * m' mod b`
+
+			We avoid a double precision multiplication (which isn't required)
+			by casting the value down to a DIGIT.  Note this requires
+			that W[ix-1] have the carry cleared (see after the inner loop)
+		*/
+		mu := ((W[ix] & _WORD(_MASK)) * _WORD(rho)) & _WORD(_MASK)
+
+		/*
+			`a = a + mu * m * b**i`
+
+			This is computed in place and on the fly.  The multiplication
+			by b**i is handled by offsetting which columns the results
+			are added to.
+
+			Note the comba method normally doesn't handle carries in the
+			inner loop. In this case we fix the carry from the previous
+			column since the Montgomery reduction requires digits of the
+			result (so far) [see above] to work.
+
+			This is handled by fixing up one carry after the inner loop.
+			The carry fixups are done in order so after these loops the
+			first n.used words of W[] have the carries fixed.
+		*/
+		for iy := 0; iy < n.used; iy += 1 {
+			W[ix + iy] += mu * _WORD(n.digit[iy])
+		}
+
+		/*
+			Now fix carry for next digit, W[ix+1].
+		*/
+		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
+	}
+
+	/*
+		Now we have to propagate the carries and shift the words downward
+		[all those least significant digits we zeroed].
+		`ix` continues from `n.used`, where the loop above left it.
+	*/
+
+	for ; ix < n.used * 2; ix += 1 {
+		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
+	}
+
+	/* copy out, A = A/b**n
+	 *
+	 * The result is A/b**n but instead of converting from an
+	 * array of _WORD to DIGIT and then shifting right by whole digits,
+	 * we just copy them in the right order
+	 */
+
+	for ix = 0; ix < (n.used + 1); ix += 1 {
+		x.digit[ix] = DIGIT(W[n.used + ix] & _WORD(_MASK))
+	}
+
+	/*
+		Set the max used.
+	*/
+	x.used = n.used + 1
+
+	/*
+		Zero old_used digits, if the input `x` was larger than n.used + 1 we'll have to clear the digits.
+	*/
+	internal_zero_unused(x, old_used)
+	internal_clamp(x)
+
+	/*
+		if A >= m then A = A - m
+	*/
+	if internal_gte_abs(x, n) {
+		return internal_sub(x, x, n)
+	}
+	return nil
+}
+
+/*
+	Computes xR**-1 == x (mod N) via Montgomery Reduction.
+	Assumes `x` and `n` not to be nil.
+
+	`x` is reduced in place. `rho` must be precalculated (see the comment on `mu` below).
+	Dispatches to `_private_montgomery_reduce_comba` when the operands are small enough.
+*/
+_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	/*
+		Make sure both operands are usable before touching their digits.
+	*/
+	internal_clear_if_uninitialized(x, n) or_return
+
+	/*
+		Can the fast reduction [comba] method be used?
+		Note that unlike in mul, you're safely allowed *less* than the available columns [255 per default],
+		since carries are fixed up in the inner loop.
+	*/
+	digs := (n.used * 2) + 1
+	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
+		return _private_montgomery_reduce_comba(x, n, rho)
+	}
+
+	/*
+		Grow the input as required
+	*/
+	internal_grow(x, digs)                                           or_return
+	x.used = digs
+
+	for ix := 0; ix < n.used; ix += 1 {
+		/*
+			`mu = ai * rho mod b`
+			The value of rho must be precalculated via `int_montgomery_setup()`,
+			such that it equals -1/n0 mod b this allows the following inner loop
+			to reduce the input one digit at a time.
+		*/
+
+		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))
+
+		/*
+			a = a + mu * m * b**i
+			Multiply and add in place.
+			`u` carries between columns; `iy` is kept outside the loop because
+			the carry propagation below continues from where it stopped.
+		*/
+		u  := DIGIT(0)
+		iy := int(0)
+		for ; iy < n.used; iy += 1 {
+			/*
+				Compute product and sum.
+			*/
+			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))
+
+			/*
+				Get carry.
+			*/
+			u = DIGIT(r >> _DIGIT_BITS)
+
+			/*
+				Fix digit.
+			*/
+			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
+		}
+
+		/*
+			At this point the ix'th digit of x should be zero.
+			Propagate carries upwards as required.
+		*/
+		for u != 0 {
+			x.digit[ix + iy] += u
+			u = x.digit[ix + iy] >> _DIGIT_BITS
+			x.digit[ix + iy] &= _MASK
+			iy += 1
+		}
+	}
+
+	/*
+		At this point the n.used'th least significant digits of x are all zero,
+		which means we can shift x to the right by n.used digits and the
+		residue is unchanged.
+
+		x = x/b**n.used.
+	*/
+	internal_clamp(x)
+	_private_int_shr_leg(x, n.used)
+
+	/*
+		if x >= n then x = x - n
+	*/
+	if internal_gte_abs(x, n) {
+		return internal_sub(x, x, n)
+	}
+
+	return nil
+}
+
+/*
+	Shifts with subtractions when the result is greater than b.
+
+	The method is slightly modified to shift B unconditionally up to just under
+	the leading bit of b.  This saves a lot of multiple precision shifting.
+
+	Assumes `a` and `b` not to be `nil`. The result is written to `a`.
+*/
+_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	internal_clear_if_uninitialized(a, b) or_return
+
+	/*
+		How many bits of the last digit does `b` use?
+	*/
+	top_bits := internal_count_bits(b) % _DIGIT_BITS
+
+	if b.used <= 1 {
+		/*
+			Single digit (or empty) modulus: start from one.
+		*/
+		internal_one(a)                                              or_return
+		top_bits = 1
+	} else {
+		/*
+			Start from the power of two just below the leading bit of `b`.
+		*/
+		exponent := ((b.used - 1) * _DIGIT_BITS) + top_bits - 1
+		internal_int_power_of_two(a, exponent)                       or_return
+	}
+
+	/*
+		Now compute C = A * B mod b, by doubling and conditionally subtracting.
+	*/
+	for step := top_bits - 1; step < _DIGIT_BITS; step += 1 {
+		internal_int_shl1(a, a)                                      or_return
+		if !internal_gte_abs(a, b) {
+			continue
+		}
+		internal_sub(a, a, b)                                        or_return
+	}
+	return nil
+}
+
+/*
+	Sets up the Montgomery reduction stuff.
+
+	Computes `rho = -1/n0 mod b`, where `n0` is the least significant digit of `n`
+	and `b = 2**_DIGIT_BITS`.
+	Returns `.Invalid_Argument` if `n` is even, since no inverse exists then.
+*/
+_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
+	/*
+		Fast inversion mod 2**k
+		Based on the fact that:
+
+		XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
+		                  =>  2*X*A - X*X*A*A = 1
+		                  =>  2*(1) - (1)     = 1
+	*/
+	internal_clear_if_uninitialized(n, allocator) or_return
+
+	b := n.digit[0]
+	if b & 1 == 0 { return 0, .Invalid_Argument }
+
+	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**8 */
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */
+
+	/*
+		This step is needed for every digit type with more than 16 value bits, not just the 64-bit one.
+		An extra iteration is harmless: it only extends the precision of an inverse that is already valid.
+	*/
+	x *= 2 - (b * x)              /* here x*a==1 mod 2**32 */
+
+	when _DIGIT_TYPE_BITS == 64 {
+		x *= 2 - (b * x)              /* here x*a==1 mod 2**64 */
+	}
+
+	/*
+		rho = -1/m mod b
+	*/
+	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
+	return rho, nil
+}
+
+/*
+	Reduces `x` mod `m`, assumes 0 < x < m**2, mu is precomputed via reduce_setup.
+	From HAC pp.604 Algorithm 14.42
+
+	Assumes `x`, `m` and `mu` all not to be `nil` and have been initialized.
+	`x` is reduced in place.
+*/
+_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	k := m.used
+
+	estimate := &Int{}
+	defer internal_destroy(estimate)
+
+	/*
+		estimate = x / b**(k-1)
+	*/
+	internal_copy(estimate, x)                                       or_return
+	_private_int_shr_leg(estimate, k - 1)
+
+	/*
+		estimate *= mu. According to HAC, computing only the high digits is ok
+		unless `k` is very large.
+	*/
+	needs_full_product := DIGIT(k) > DIGIT(1) << (_DIGIT_BITS - 1)
+	if !needs_full_product {
+		_private_int_mul_high(estimate, estimate, mu, k)             or_return
+	} else {
+		internal_mul(estimate, estimate, mu)                         or_return
+	}
+
+	/*
+		estimate = estimate / b**(k+1)
+	*/
+	_private_int_shr_leg(estimate, k + 1)
+
+	/*
+		x = x mod b**(k+1), quick (no division)
+	*/
+	internal_int_mod_bits(x, x, _DIGIT_BITS * (k + 1))               or_return
+
+	/*
+		estimate = estimate * m mod b**(k+1), quick (no division)
+	*/
+	_private_int_mul(estimate, estimate, m, k + 1)                   or_return
+
+	/*
+		x = x - estimate
+	*/
+	internal_sub(x, x, estimate)                                     or_return
+
+	/*
+		If x < 0, add b**(k+1) to it. `estimate` is reused as scratch space.
+	*/
+	if internal_is_negative(x) {
+		internal_set(estimate, 1)                                    or_return
+		_private_int_shl_leg(estimate, k + 1)                        or_return
+		internal_add(x, x, estimate)                                 or_return
+	}
+
+	/*
+		Back off if it's too big.
+	*/
+	for {
+		if !internal_gte(x, m) { break }
+		internal_sub(x, x, m)                                        or_return
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
+	`a` is reduced in place; `d` is a single digit.
+*/
+_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	high := &Int{}
+	defer internal_destroy(high)
+
+	internal_zero(high)                                              or_return
+
+	bit_count := internal_count_bits(n)
+
+	done := false
+	for !done {
+		/*
+			high = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(high, a, a, bit_count)                       or_return
+
+		/*
+			high = high * d (multiplying by one would be a no-op, so skip it)
+		*/
+		if d != 1 {
+			internal_mul(high, high, d)                              or_return
+		}
+
+		/*
+			a = a + high
+		*/
+		internal_add(a, a, high)                                     or_return
+
+		/*
+			Stop once |a| < |n|, otherwise subtract `n` and go around again.
+		*/
+		done = internal_lt_abs(a, n)
+		if !done {
+			internal_sub(a, a, n)                                    or_return
+		}
+	}
+
+	return nil
+}
+
+/*
+	Reduces `a` modulo `n` where `n` is of the form 2**p - d
+	This differs from reduce_2k since "d" can be larger than a single digit.
+	`a` is reduced in place.
+*/
+_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	high := &Int{}
+	defer internal_destroy(high)
+
+	internal_zero(high)                                              or_return
+
+	bit_count := internal_count_bits(n)
+
+	done := false
+	for !done {
+		/*
+			high = a/2**p, a = a mod 2**p
+		*/
+		internal_shrmod(high, a, a, bit_count)                       or_return
+
+		/*
+			high = high * d
+		*/
+		internal_mul(high, high, d)                                  or_return
+
+		/*
+			a = a + high
+		*/
+		internal_add(a, a, high)                                     or_return
+
+		/*
+			Stop once |a| < |n|, otherwise subtract `n` and go around again.
+		*/
+		done = internal_lt_abs(a, n)
+		if !done {
+			internal_sub(a, a, n)                                    or_return
+		}
+	}
+
+	return nil
+}
+
+/*
+	Determines if `internal_int_reduce_2k` can be used.
+	Assumes `a` not to be `nil` and to have been initialized.
+
+	Zero is not reducible, a single digit value always is; otherwise every bit
+	from the second digit upwards has to be set.
+*/
+_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	switch {
+	case internal_is_zero(a):
+		return false, nil
+
+	case a.used == 1:
+		return true, nil
+
+	case a.used > 1:
+		total_bits := internal_count_bits(a)
+		digit_idx  := 1
+		bit_mask   := DIGIT(1)
+
+		/*
+			Test every bit from the second digit up, must be 1.
+		*/
+		for bit := _DIGIT_BITS; bit < total_bits; bit += 1 {
+			if a.digit[digit_idx] & bit_mask == 0 {
+				return false, nil
+			}
+
+			/*
+				Advance to the next bit, moving on to the next digit once the mask leaves the digit range.
+			*/
+			bit_mask <<= 1
+			if bit_mask > _DIGIT_MAX {
+				digit_idx += 1
+				bit_mask   = 1
+			}
+		}
+	}
+	return true, nil
+}
+
+/*
+	Determines if `internal_int_reduce_2k_l` can be used.
+	Assumes `a` not to be `nil` and to have been initialized.
+
+	Zero is not reducible, a single digit value always is; otherwise at least
+	half of the digits have to be equal to `_DIGIT_MAX`.
+*/
+_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
+	assert_if_nil(a)
+
+	if internal_int_is_zero(a) {
+		return false, nil
+	}
+	if a.used == 1 {
+		return true, nil
+	}
+	if a.used < 1 {
+		return false, nil
+	}
+
+	/*
+		Count the digits that are all ones.
+	*/
+	full_digits := 0
+	for idx := 0; idx < a.used; idx += 1 {
+		if a.digit[idx] == _DIGIT_MAX {
+			full_digits += 1
+		}
+	}
+
+	reducible = full_digits >= (a.used / 2)
+	return reducible, nil
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` is not `nil`.
+
+	Returns the least significant digit of `2**bits(a) - a`.
+*/
+_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
+	context.allocator = allocator
+
+	diff := &Int{}
+	defer internal_destroy(diff)
+	internal_zero(diff)                                              or_return
+
+	/*
+		diff = 2**bits(a) - a
+	*/
+	bit_count := internal_count_bits(a)
+	internal_int_power_of_two(diff, bit_count)                       or_return
+	internal_sub(diff, diff, a)                                      or_return
+
+	d = diff.digit[0]
+	return d, nil
+}
+
+/*
+	Determines the setup value.
+	Assumes `mu` and `P` are not `nil`.
+
+	mu := (1 << P.bits) - P;
+*/
+_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	power := &Int{}
+	defer internal_destroy(power)
+	internal_zero(power)                                             or_return
+
+	/*
+		power = 2**bits(P), then mu = power - P
+	*/
+	bit_count := internal_count_bits(P)
+	internal_int_power_of_two(power, bit_count)                      or_return
+
+	err = internal_sub(mu, power, P)
+	return err
+}
+
+/*
+	Pre-calculate the value required for Barrett reduction.
+	For a given modulus "P" it calculates the value required in "mu":
+	mu = 2**(2 * P.used * _DIGIT_BITS) / P
+
+	Assumes `mu` and `P` are not `nil`.
+*/
+_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	exponent := P.used * 2 * _DIGIT_BITS
+	internal_int_power_of_two(mu, exponent)                          or_return
+
+	err = internal_int_div(mu, mu, P)
+	return err
+}
+
+/*
+	Determines the setup value.
+	Assumes `a` to not be `nil` and to have been initialized.
+
+	Returns `2**_DIGIT_BITS` minus the least significant digit of `a`.
+*/
+_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
+	/*
+		The cast is required if _DIGIT_BITS is one less than
+		the number of bits in a DIGIT [e.g. _DIGIT_BITS==31].
+	*/
+	d = DIGIT((1 << _DIGIT_BITS) - a.digit[0])
+	return d
+}
+
+/*
+	Determines if a number is a valid DR modulus.
+	Assumes `a` to not be `nil` and to have been initialized.
+*/
+_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
+	/*
+		Must be at least two digits.
+	*/
+	res = a.used >= 2
+
+	/*
+		Must be of the form b**k - a [a <= b] so all but the first digit must be equal to -1 (mod b).
+		The scan stops at the first digit that isn't.
+	*/
+	for idx := 1; res && idx < a.used; idx += 1 {
+		res = a.digit[idx] == _MASK
+	}
+	return res
+}
+
+/*
+	Reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+	Based on algorithm from the paper
+
+		"Generating Efficient Primes for Discrete Log Cryptosystems"
+					Chae Hoon Lim, Pil Joong Lee,
+			POSTECH Information Research Laboratories
+
+	The modulus must be of a special format [see manual].
+	Has been modified to use algorithm 7.10 from the LTM book instead
+
+	Input x must be in the range 0 <= x <= (n-1)**2
+	Assumes `x` and `n` to not be `nil` and to have been initialized.
+	`k` is the setup value for the modulus.
+*/
+_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
+	/*
+		Honor the allocator we were given for any growth of `x`.
+	*/
+	context.allocator = allocator
+
+	/*
+		m = digits in modulus.
+	*/
+	m := n.used
+
+	/*
+		Ensure that "x" has at least 2m digits.
+	*/
+	internal_grow(x, m + m)                                          or_return
+
+	/*
+		Top of loop, this is where the code resumes if another reduction pass is required.
+	*/
+	for {
+		i: int
+		mu := DIGIT(0)
+
+		/*
+			Compute (x mod B**m) + k * [x/B**m] inline and inplace.
+			The sum is accumulated in a double precision word.
+		*/
+		for i = 0; i < m; i += 1 {
+			r         := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i]) + _WORD(mu)
+			x.digit[i] = DIGIT(r & _WORD(_MASK))
+			mu         = DIGIT(r >> _WORD(_DIGIT_BITS))
+		}
+
+		/*
+			Set final carry.
+		*/
+		x.digit[i] = mu
+
+		/*
+			Zero words above m, i.e. digits m + 1 .. x.used - 1.
+			After a subtraction pass `x` may have fewer than m + 2 digits,
+			in which case there is nothing to clear.
+		*/
+		if stale := x.used - m - 1; stale > 0 {
+			mem.zero_slice(x.digit[m + 1:][:stale])
+		}
+
+		/*
+			Clamp, sub and return.
+		*/
+		internal_clamp(x)                                            or_return
+
+		/*
+			If x >= n then subtract and reduce again.
+			Each successive "recursion" makes the input smaller and smaller.
+		*/
+		if internal_lt_abs(x, n) { break }
+
+		internal_sub(x, x, n)                                        or_return
+	}
+	return nil
+}
+
+/*
+	Computes res == G**X mod P.
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+
+	`redmode` selects the reduction pair:
+		0:    `_private_int_reduce_setup`      + `_private_int_reduce` (Barrett)
+		else: `_private_int_reduce_2k_setup_l` + `_private_int_reduce_2k_l`
+
+	Returns the first error reported by any of the helpers, `nil` otherwise.
+*/
+_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)
+
+	/*
+		On exit, release the table entries this procedure uses: M[1] and the upper half of the window.
+		`winsize` is read when the deferred block runs, by which time it has been assigned below.
+	*/
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+		The more bits the exponent has, the larger the window.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	/*
+		A positive `_MAX_WIN_SIZE` caps the window size.
+	*/
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array.
+		Init first cell.
+	*/
+	internal_zero(&M[1])                                             or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_zero(&M[x])                                         or_return
+	}
+
+	/*
+		Create `mu`, the setup value handed to `redux` on every reduction.
+	*/
+	mu := &Int{}
+	defer internal_destroy(mu)
+	internal_zero(mu)                                                or_return
+
+	if redmode == 0 {
+		_private_int_reduce_setup(mu, P)                             or_return
+		redux = _private_int_reduce
+	} else {
+		_private_int_reduce_2k_setup_l(mu, P)                        or_return
+		redux = _private_int_reduce_2k_l
+	}
+
+	/*
+		Create M table.
+
+		The M table contains powers of the base, e.g. M[x] = G**x mod P.
+		The first half of the table is not computed, though, except for M[0] and M[1].
+	*/
+	internal_int_mod(&M[1], G, P)                                    or_return
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+
+		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
+		instead of repeated squaring.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		/*
+			Square it.
+		*/
+		internal_sqr(&M[slot], &M[slot])                             or_return
+
+		/*
+			Reduce modulo P
+		*/
+		redux(&M[slot], P, mu)                                       or_return
+	}
+
+	/*
+		Create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+		for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+	*/
+	for x = slot + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, mu)                                          or_return
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_one(res)                                                or_return
+
+	/*
+		Set initial mode and bit cnt.
+
+		mode 0: still skipping the leading zero bits of the exponent.
+		mode 1: between windows; a zero bit only costs a squaring.
+		mode 2: currently collecting bits into the window `bitbuf`.
+
+		`bitcnt` starts at 1 so that the first iteration immediately fetches the top digit of `X`.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := uint(0)
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := buf >> (_DIGIT_BITS - 1) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit
+			in the exponent.  Technically this opt is not required but it
+			does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 {
+			continue
+		}
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+			Bits are stored from the top of the window downwards.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - bitcpy))
+		mode    = 2
+
+		if (bitcpy == winsize) {
+			/*
+				Window is filled so square as required and multiply.
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, mu)                                    or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, mu)                                        or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+		This handles a window that was only partially filled when the exponent ran out.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < int(bitcpy); x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, mu)                                        or_return
+
+			/*
+				Shift the next collected bit up to position `winsize` and test it there.
+			*/
+			bitbuf <<= 1
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, mu)                                    or_return
+			}
+		}
+	}
+	return err
+}
+
+/*
+	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+
+	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
+	The value of `k` changes based on the size of the exponent.
+
+	Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+
+	`redmode` selects the reduction:
+		0:    Montgomery (`_private_montgomery_reduce_comba` or `_private_int_montgomery_reduce`)
+		1:    `_private_int_dr_setup`        + `_private_int_dr_reduce`
+		else: `_private_int_reduce_2k_setup` + `_private_int_reduce_2k`
+
+	Assumes `res`, `G`, `X` and `P` to not be `nil` and for `G`, `X` and `P` to have been initialized.
+*/
+_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	M := [_TAB_SIZE]Int{}
+	winsize: uint
+
+	/*
+		Use a pointer to the reduction algorithm.
+		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
+	*/
+	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)
+
+	/*
+		On exit, release the table entries this procedure uses: M[1] and the upper half of the window.
+		`winsize` is read when the deferred block runs, by which time it has been assigned below.
+	*/
+	defer {
+		internal_destroy(&M[1])
+		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+			internal_destroy(&M[x])
+		}
+	}
+
+	/*
+		Find window size.
+		The more bits the exponent has, the larger the window.
+	*/
+	x := internal_count_bits(X)
+	switch {
+	case x <= 7:
+		winsize = 2
+	case x <= 36:
+		winsize = 3
+	case x <= 140:
+		winsize = 4
+	case x <= 450:
+		winsize = 5
+	case x <= 1303:
+		winsize = 6
+	case x <= 3529:
+		winsize = 7
+	case:
+		winsize = 8
+	}
+
+	/*
+		A positive `_MAX_WIN_SIZE` caps the window size.
+	*/
+	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize
+
+	/*
+		Init M array
+		Init first cell.
+		Every table entry is grown to the allocated capacity of `P`.
+	*/
+	cap := internal_int_allocated_cap(P)
+	internal_grow(&M[1], cap)                                        or_return
+
+	/*
+		Now init the second half of the array.
+	*/
+	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
+		internal_grow(&M[x], cap)                                    or_return
+	}
+
+	/*
+		Determine and setup reduction code.
+		`rho` is the setup value handed to `redux` on every reduction.
+	*/
+	rho: DIGIT
+
+	if redmode == 0 {
+		/*
+			Now setup Montgomery.
+		*/
+		rho = _private_int_montgomery_setup(P)                       or_return
+
+		/*
+			Automatically pick the comba one if available (saves quite a few calls/ifs).
+		*/
+		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
+			redux = _private_montgomery_reduce_comba
+		} else {
+			/*
+				Use slower baseline Montgomery method.
+			*/
+			redux = _private_int_montgomery_reduce
+		}
+	} else if redmode == 1 {
+		/*
+			Setup DR reduction for moduli of the form B**k - b.
+		*/
+		rho = _private_int_dr_setup(P)
+		redux = _private_int_dr_reduce
+	} else {
+		/*
+			Setup DR reduction for moduli of the form 2**k - b.
+		*/
+		rho = _private_int_reduce_2k_setup(P)                        or_return
+		redux = _private_int_reduce_2k
+	}
+
+	/*
+		Setup result.
+	*/
+	internal_grow(res, cap)                                          or_return
+
+	/*
+		Create M table
+		The first half of the table is not computed, though, except for M[0] and M[1]
+	*/
+
+	if redmode == 0 {
+		/*
+			Now we need R mod m.
+		*/
+		_private_int_montgomery_calc_normalization(res, P)           or_return
+
+		/*
+			Now set M[1] to G * R mod m.
+		*/
+		internal_mulmod(&M[1], G, res, P)                            or_return
+	} else {
+		/*
+			No Montgomery form: the result starts at 1 and M[1] is G mod P.
+		*/
+		internal_one(res)                                            or_return
+		internal_mod(&M[1], G, P)                                    or_return
+	}
+
+	/*
+		Compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times.
+	*/
+	slot := 1 << (winsize - 1)
+	internal_copy(&M[slot], &M[1])                                   or_return
+
+	for x = 0; x < int(winsize - 1); x += 1 {
+		internal_sqr(&M[slot], &M[slot])                             or_return
+		redux(&M[slot], P, rho)                                      or_return
+	}
+
+	/*
+		Create upper table.
+		M[x] = M[x - 1] * M[1], reduced, for every x above `slot` inside the window.
+	*/
+	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
+		internal_mul(&M[x], &M[x - 1], &M[1])                        or_return
+		redux(&M[x], P, rho)                                         or_return
+	}
+
+	/*
+		Set initial mode and bit cnt.
+
+		mode 0: still skipping the leading zero bits of the exponent.
+		mode 1: between windows; a zero bit only costs a squaring.
+		mode 2: currently collecting bits into the window `bitbuf`.
+
+		`bitcnt` starts at 1 so that the first iteration immediately fetches the top digit of `X`.
+	*/
+	mode   := 0
+	bitcnt := 1
+	buf    := DIGIT(0)
+	digidx := X.used - 1
+	bitcpy := 0
+	bitbuf := DIGIT(0)
+
+	for {
+		/*
+			Grab next digit as required.
+		*/
+		bitcnt -= 1
+		if bitcnt == 0 {
+			/*
+				If digidx == -1 we are out of digits so break.
+			*/
+			if digidx == -1 { break }
+
+			/*
+				Read next digit and reset the bitcnt.
+			*/
+			buf    = X.digit[digidx]
+			digidx -= 1
+			bitcnt = _DIGIT_BITS
+		}
+
+		/*
+			Grab the next msb from the exponent.
+		*/
+		y := (buf >> (_DIGIT_BITS - 1)) & 1
+		buf <<= 1
+
+		/*
+			If the bit is zero and mode == 0 then we ignore it.
+			These represent the leading zero bits before the first 1 bit in the exponent.
+			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
+		*/
+		if mode == 0 && y == 0 { continue }
+
+		/*
+			If the bit is zero and mode == 1 then we square.
+		*/
+		if mode == 1 && y == 0 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+			continue
+		}
+
+		/*
+			Else we add it to the window.
+			Bits are stored from the top of the window downwards.
+		*/
+		bitcpy += 1
+		bitbuf |= (y << (winsize - uint(bitcpy)))
+		mode    = 2
+
+		if bitcpy == int(winsize) {
+			/*
+				Window is filled so square as required and multiply
+				Square first.
+			*/
+			for x = 0; x < int(winsize); x += 1 {
+				internal_sqr(res, res)                               or_return
+				redux(res, P, rho)                                   or_return
+			}
+
+			/*
+				Then multiply.
+			*/
+			internal_mul(res, res, &M[bitbuf])                       or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Empty window and reset.
+			*/
+			bitcpy = 0
+			bitbuf = 0
+			mode   = 1
+		}
+	}
+
+	/*
+		If bits remain then square/multiply.
+		This handles a window that was only partially filled when the exponent ran out.
+	*/
+	if mode == 2 && bitcpy > 0 {
+		/*
+			Square then multiply if the bit is set.
+		*/
+		for x = 0; x < bitcpy; x += 1 {
+			internal_sqr(res, res)                                   or_return
+			redux(res, P, rho)                                       or_return
+
+			/*
+				Get next bit of the window.
+			*/
+			bitbuf <<= 1
+			if bitbuf & (1 << winsize) != 0 {
+				/*
+					Then multiply.
+				*/
+				internal_mul(res, res, &M[1])                        or_return
+				redux(res, P, rho)                                   or_return
+			}
+		}
+	}
+
+	if redmode == 0 {
+		/*
+			Fixup result if Montgomery reduction is used.
+			Recall that any value in a Montgomery system is actually multiplied by R mod n.
+			So we have to reduce one more time to cancel out the factor of R.
+		*/
+		redux(res, P, rho)                                           or_return
+	}
+
+	return nil
+}
+
+/*
+	hac 14.61, pp608
+
+	Computes the modular inverse of `a` modulo `b` and stores it in `dest`.
+	Returns `.Invalid_Argument` if `b` is not positive, if `a` mod `b` and `b` are both even,
+	or if no inverse exists.
+*/
+_private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	x, y, u, v, A, B, C, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(x, y, u, v, A, B, C, D)
+
+	// `b` must be strictly positive: neither negative nor zero.
+	if b.sign == .Negative || internal_is_zero(b) {
+		return .Invalid_Argument
+	}
+
+	// init temps.
+	internal_init_multi(x, y, u, v, A, B, C, D) or_return
+
+	// `x` = `a` % `b`, `y` = `b`
+	internal_mod(x, a, b) or_return
+	internal_copy(y, b) or_return
+
+	// 2. [modified] if x,y are both even then return an error!
+	if internal_is_even(x) && internal_is_even(y) {
+		return .Invalid_Argument
+	}
+
+	// 3. u=x, v=y, A=1, B=0, C=0, D=1
+	//    (`B` and `C` are not assigned here; they keep the value `internal_init_multi` gave them.)
+	internal_copy(u, x) or_return
+	internal_copy(v, y) or_return
+	internal_one(A) or_return
+	internal_one(D) or_return
+
+	for {
+		// 4.  while `u` is even do:
+		for internal_is_even(u) {
+			// 4.1 `u` = `u` / 2
+			internal_int_shr1(u, u) or_return
+
+			// 4.2 if `A` or `B` is odd then:
+			if internal_is_odd(A) || internal_is_odd(B) {
+				// `A` = (`A`+`y`) / 2, `B` = (`B`-`x`) / 2
+				internal_add(A, A, y) or_return
+				internal_sub(B, B, x) or_return
+			}
+			// `A` = `A` / 2, `B` = `B` / 2
+			internal_int_shr1(A, A) or_return
+			internal_int_shr1(B, B) or_return
+		}
+
+		// 5.  while `v` is even do:
+		for internal_is_even(v) {
+			// 5.1 `v` = `v` / 2
+			internal_int_shr1(v, v) or_return
+
+			// 5.2 if `C` or `D` is odd then:
+			if internal_is_odd(C) || internal_is_odd(D) {
+				// `C` = (`C`+`y`) / 2, `D` = (`D`-`x`) / 2
+				internal_add(C, C, y) or_return
+				internal_sub(D, D, x) or_return
+			}
+			// `C` = `C` / 2, `D` = `D` / 2
+			internal_int_shr1(C, C) or_return
+			internal_int_shr1(D, D) or_return
+		}
+
+		// 6.  if `u` >= `v` then:
+		if internal_cmp(u, v) != -1 {
+			// `u` = `u` - `v`, `A` = `A` - `C`, `B` = `B` - `D`
+			internal_sub(u, u, v) or_return
+			internal_sub(A, A, C) or_return
+			internal_sub(B, B, D) or_return
+		} else {
+			// `v` = `v` - `u`, `C` = `C` - `A`, `D` = `D` - `B`
+			internal_sub(v, v, u) or_return
+			internal_sub(C, C, A) or_return
+			internal_sub(D, D, B) or_return
+		}
+
+		// If `u` is not zero go back to step 4, otherwise we are done iterating.
+		if internal_is_zero(u) {
+			break
+		}
+	}
+
+	// Now `a` = `C`, `b` = `D`, `gcd` == `g`*`v`
+
+	// If `v` != `1` then there is no inverse.
+	if !internal_eq(v, 1) {
+		return .Invalid_Argument
+	}
+
+	// If its too low: add `b` until `C` is no longer negative.
+	for internal_is_negative(C) {
+		internal_add(C, C, b) or_return
+	}
+
+	// Too big: subtract `b` until the magnitude of `C` is below that of `b`.
+	for internal_cmp_mag(C, b) > -1 {
+		internal_sub(C, C, b) or_return
+	}
+
+	// `C` is now the inverse.
+	// After the swap `C` holds the previous contents of `dest`, which the deferred destroy releases.
+	swap(dest, C)
+	return
+}
+
+/*
+	Computes the modular inverse via binary extended Euclidean algorithm, that is `dest` = 1 / `a` mod `b`.
+
+	Based on slow invmod except this is optimized for the case where `b` is odd,
+	as per HAC Note 14.64 on pp. 610.
+
+	Returns `.Invalid_Argument` if `b` is even, if `b` or `a` mod `b` is zero, or if no inverse exists.
+*/
+_private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+	x, y, u, v, B, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
+	defer internal_destroy(x, y, u, v, B, D)
+
+	sign: Sign
+
+	/*
+		2. [modified] `b` must be odd.
+	*/
+	if internal_is_even(b) { return .Invalid_Argument }
+
+	/*
+		Init all our temps.
+	*/
+	internal_init_multi(x, y, u, v, B, D) or_return
+
+	/*
+		`x` == modulus, `y` == value to invert.
+	*/
+	internal_copy(x, b) or_return
+
+	/*
+		`y` = `a` mod `b`.
+	*/
+	internal_mod(y, a, b) or_return
+
+	/*
+		If one of `x`, `y` is zero return an error!
+	*/
+	if internal_is_zero(x) || internal_is_zero(y) { return .Invalid_Argument }
+
+	/*
+		3. `u` = `x`, `v` = `y`, `A` = 1, `B` = 0, `C` = 0, `D` = 1
+		This variant only tracks `B` and `D`; `A` and `C` from the general algorithm are not needed.
+	*/
+	internal_copy(u, x) or_return
+	internal_copy(v, y) or_return
+
+	internal_one(D) or_return
+
+	for {
+		/*
+			4.  while `u` is even do.
+		*/
+		for internal_is_even(u) {
+			/*
+				4.1 `u` = `u` / 2
+			*/
+			internal_int_shr1(u, u) or_return
+
+			/*
+				4.2 if `B` is odd then:
+			*/
+			if internal_is_odd(B) {
+				/*
+					`B` = (`B` - `x`) / 2
+				*/
+				internal_sub(B, B, x) or_return
+			}
+
+			/*
+				`B` = `B` / 2
+			*/
+			internal_int_shr1(B, B) or_return
+		}
+
+		/*
+			5.  while `v` is even do:
+		*/
+		for internal_is_even(v) {
+			/*
+				5.1 `v` = `v` / 2
+			*/
+			internal_int_shr1(v, v) or_return
+
+			/*
+				5.2 if `D` is odd then:
+			*/
+			if internal_is_odd(D) {
+				/*
+					`D` = (`D` - `x`) / 2
+				*/
+				internal_sub(D, D, x) or_return
+			}
+			/*
+				`D` = `D` / 2
+			*/
+			internal_int_shr1(D, D) or_return
+		}
+
+		/*
+			6.  if `u` >= `v` then:
+		*/
+		if internal_cmp(u, v) != -1 {
+			/*
+				`u` = `u` - `v`, `B` = `B` - `D`
+			*/
+			internal_sub(u, u, v) or_return
+			internal_sub(B, B, D) or_return
+		} else {
+			/*
+				`v` = `v` - `u`, `D` = `D` - `B`
+			*/
+			internal_sub(v, v, u) or_return
+			internal_sub(D, D, B) or_return
+		}
+
+		/*
+			If `u` is not zero go back to step 4, otherwise we are done iterating.
+		*/
+		if internal_is_zero(u) { break }
+	}
+
+	/*
+		Now `a` = C, `b` = D, gcd == g*v
+	*/
+
+	/*
+		if `v` != 1 then there is no inverse
+	*/
+	if internal_cmp(v, 1) != 0 {
+		return .Invalid_Argument
+	}
+
+	/*
+		`D` is now the inverse.
+		Remember the sign of `a`, then add `b` until `D` is no longer negative.
+	*/
+	sign = a.sign
+	for internal_int_is_negative(D) {
+		internal_add(D, D, b) or_return
+	}
+
+	/*
+		Too big: subtract `b` while the magnitude of `D` is at least that of `b`.
+	*/
+	for internal_gte_abs(D, b) {
+		internal_sub(D, D, b) or_return
+	}
+
+	/*
+		Hand the result to `dest`; the result is given the sign of `a`.
+		After the swap `D` holds the previous contents of `dest`, which the deferred destroy releases.
+	*/
+	swap(dest, D)
+	dest.sign = sign
+	return nil
+}
+
+
+/*
+	Returns the logarithm of an `Int` to a `base` that is a power of two.
+	Assumes `a` not to be `nil` and to have been initialized.
+	Also assumes `base` is a power of two, and at least 2.
+
+	The result is (count_bits(a) - 1) / log2(base).
+	Returns `.Invalid_Argument` for a `base` of zero or an odd `base` (which includes 1),
+	because neither has a usable log2.
+*/
+_private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) {
+	base := base
+
+	/*
+		A zero `base` has no set bit, so the loop below would never terminate.
+	*/
+	if base == 0 { return -1, .Invalid_Argument }
+
+	/*
+		y = number of trailing zero bits of `base`, which is log2(base) for a power of two.
+	*/
+	y: int
+	for y = 0; base & 1 == 0; {
+		y += 1
+		base >>= 1
+	}
+
+	/*
+		An odd `base` leaves `y` at zero, which would divide by zero below.
+	*/
+	if y == 0 { return -1, .Invalid_Argument }
+
+	log = internal_count_bits(a)
+	return (log - 1) / y, err
+}
+
+/*
+	Copies up to `digits` DIGITs from `src`, starting at index `offset`, to the start of `dest`.
+	The count is clamped to what `src` holds from `offset` onwards and to the length of `dest`.
+	Does nothing if `dest` == `src` or if there is nothing to copy.
+
+	Assumes `src` and `dest` to not be `nil` and have been initialized.
+	`offset` is expected to be non-negative.
+*/
+_private_copy_digits :: proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
+	/*
+		If dest == src, do nothing
+	*/
+	if dest == src {
+		return nil
+	}
+
+	/*
+		Never read past the end of `src` (taking `offset` into account),
+		nor write past the end of `dest`.
+	*/
+	count := min(digits, len(src.digit) - offset, len(dest.digit))
+	if count <= 0 {
+		return nil
+	}
+
+	mem.copy_non_overlapping(&dest.digit[0], &src.digit[offset], size_of(DIGIT) * count)
+	return nil
+}
+
+
+/*
+	Shifts `quotient` left by `digits` whole DIGITs, i.e. by `digits` * _DIGIT_BITS bits.
+*/
+_private_int_shl_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	// A non-positive shift is a no-op.
+	if digits <= 0 {
+		return nil
+	}
+
+	// So is shifting a zero.
+	if #force_inline internal_is_zero(quotient) {
+		return nil
+	}
+
+	// Make room for the digits that are about to be shifted in.
+	#force_inline internal_grow(quotient, quotient.used + digits) or_return
+
+	// Walk from the most significant used digit downwards, moving each one up by `digits` places.
+	#no_bounds_check for src := quotient.used - 1; src >= 0; src -= 1 {
+		quotient.digit[src + digits] = quotient.digit[src]
+	}
+
+	// Clear the vacated low digits and account for the new length.
+	mem.zero_slice(quotient.digit[:digits])
+	quotient.used += digits
+	return nil
+}
+
+/*
+	Shifts `quotient` right by `digits` whole DIGITs, i.e. by `digits` * _DIGIT_BITS bits.
+*/
+_private_int_shr_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
+	context.allocator = allocator
+
+	switch {
+	case digits <= 0:
+		// A non-positive shift is a no-op.
+		return nil
+	case digits > quotient.used:
+		// Shifting out more digits than are in use leaves zero.
+		return internal_zero(quotient)
+	}
+
+	/*
+		Slide every surviving digit down by `digits` places, lowest first:
+
+		b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+					/\                   |      ---->
+					 \-------------------/      ---->
+	*/
+	remaining := quotient.used - digits
+	#no_bounds_check for dst in 0..<remaining {
+		quotient.digit[dst] = quotient.digit[dst + digits]
+	}
+
+	// Shrink the used count, then tidy up the digits that are no longer in use.
+	quotient.used = remaining
+	internal_zero_unused(quotient)
+	return internal_clamp(quotient)
+}
+
+/*	
+	========================    End of private procedures    =======================
+
+	===============================  Private tables  ===============================
+
+	Tables used by `internal_*` and `_*`.
+*/
+
+/*
+	Lookup table indexed by a residue modulo 128.
+	Entry `r` is 0 when `r` is a square modulo 128 (0, 1, 4, 9, 16, 17, 25, ...) and 1 otherwise.
+	NOTE(review): presumably a quick filter to reject values that cannot be perfect squares - confirm at the call site.
+*/
+_private_int_rem_128 := [?]DIGIT{
+	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+}
+// Compile-time check that the table has exactly 128 entries.
+#assert(128 * size_of(DIGIT) == size_of(_private_int_rem_128))
+
+/*
+	Lookup table indexed by a residue modulo 105 (= 3 * 5 * 7).
+	Entry `r` is 0 when `r` is a square modulo 105 (0, 1, 4, 9, 15, 16, 21, 25, ...) and 1 otherwise.
+	NOTE(review): presumably a quick filter to reject values that cannot be perfect squares - confirm at the call site.
+*/
+_private_int_rem_105 := [?]DIGIT{
+	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+	0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+	0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
+	1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+	0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+	1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
+}
+// Compile-time check that the table has exactly 105 entries.
+#assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
+
+/*
+	Table of small primes in ascending order, from 2 (0x0002) up to 1619 (0x0653).
+	It holds `_PRIME_TAB_SIZE` entries, laid out as four groups of 64.
+*/
+_PRIME_TAB_SIZE :: 256
+_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
+	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+	0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
+	0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+	0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+	0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+	0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+	0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+	0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+	0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+	0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+	0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+	0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+	0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+	0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+	0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+	0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+	0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+	0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+	0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+	0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+	0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+	0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+	0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+	0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+	0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+	0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+	0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+	0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
+}
+// Compile-time check that the table size matches `_PRIME_TAB_SIZE`.
+#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
+
+/*
+	Precomputed factorials: `_factorial_table[n]` == n!.
+
+	Which table is compiled in depends on the build: the first branch is taken when
+	`MATH_BIG_FORCE_64_BIT` is set, or when `MATH_BIG_FORCE_32_BIT` is not set and pointers are 8 bytes.
+	That table runs up to 34!, the other one up to 20!.
+*/
+when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
+	_factorial_table := [35]_WORD{
+/* f(00): */                                                     1,
+/* f(01): */                                                     1,
+/* f(02): */                                                     2,
+/* f(03): */                                                     6,
+/* f(04): */                                                    24,
+/* f(05): */                                                   120,
+/* f(06): */                                                   720,
+/* f(07): */                                                 5_040,
+/* f(08): */                                                40_320,
+/* f(09): */                                               362_880,
+/* f(10): */                                             3_628_800,
+/* f(11): */                                            39_916_800,
+/* f(12): */                                           479_001_600,
+/* f(13): */                                         6_227_020_800,
+/* f(14): */                                        87_178_291_200,
+/* f(15): */                                     1_307_674_368_000,
+/* f(16): */                                    20_922_789_888_000,
+/* f(17): */                                   355_687_428_096_000,
+/* f(18): */                                 6_402_373_705_728_000,
+/* f(19): */                               121_645_100_408_832_000,
+/* f(20): */                             2_432_902_008_176_640_000,
+/* f(21): */                            51_090_942_171_709_440_000,
+/* f(22): */                         1_124_000_727_777_607_680_000,
+/* f(23): */                        25_852_016_738_884_976_640_000,
+/* f(24): */                       620_448_401_733_239_439_360_000,
+/* f(25): */                    15_511_210_043_330_985_984_000_000,
+/* f(26): */                   403_291_461_126_605_635_584_000_000,
+/* f(27): */                10_888_869_450_418_352_160_768_000_000,
+/* f(28): */               304_888_344_611_713_860_501_504_000_000,
+/* f(29): */             8_841_761_993_739_701_954_543_616_000_000,
+/* f(30): */           265_252_859_812_191_058_636_308_480_000_000,
+/* f(31): */         8_222_838_654_177_922_817_725_562_880_000_000,
+/* f(32): */       263_130_836_933_693_530_167_218_012_160_000_000,
+/* f(33): */     8_683_317_618_811_886_495_518_194_401_280_000_000,
+/* f(34): */   295_232_799_039_604_140_847_618_609_643_520_000_000,
+	}
+} else {
+	_factorial_table := [21]_WORD{
+/* f(00): */                                                     1,
+/* f(01): */                                                     1,
+/* f(02): */                                                     2,
+/* f(03): */                                                     6,
+/* f(04): */                                                    24,
+/* f(05): */                                                   120,
+/* f(06): */                                                   720,
+/* f(07): */                                                 5_040,
+/* f(08): */                                                40_320,
+/* f(09): */                                               362_880,
+/* f(10): */                                             3_628_800,
+/* f(11): */                                            39_916_800,
+/* f(12): */                                           479_001_600,
+/* f(13): */                                         6_227_020_800,
+/* f(14): */                                        87_178_291_200,
+/* f(15): */                                     1_307_674_368_000,
+/* f(16): */                                    20_922_789_888_000,
+/* f(17): */                                   355_687_428_096_000,
+/* f(18): */                                 6_402_373_705_728_000,
+/* f(19): */                               121_645_100_408_832_000,
+/* f(20): */                             2_432_902_008_176_640_000,
+	}
+}
+
+/*
+	=========================  End of private tables  ========================
 */
\ No newline at end of file
diff --git a/core/os/os_haiku.odin b/core/os/os_haiku.odin
new file mode 100644
index 000000000..06052fc42
--- /dev/null
+++ b/core/os/os_haiku.odin
@@ -0,0 +1,435 @@
+package os
+
+foreign import libc "system:c"
+
+import "base:runtime"
+import "core:c"
+import "core:strings"
+import "core:sys/haiku"
+
+// Scalar types used by the procedures in this file.
+Handle    :: i32
+Pid       :: i32
+File_Time :: i64
+Errno     :: i32
+
+// Path length limit, taken from the Haiku bindings.
+MAX_PATH :: haiku.PATH_MAX
+
+// Same offset from POSIX_ERROR_BASE as haiku.Errno.ENOSYS, expressed as a plain int.
+ENOSYS :: int(haiku.Errno.POSIX_ERROR_BASE) + 9
+
+// All bits set, i.e. -1 for the signed Handle type; returned by open() on failure.
+INVALID_HANDLE :: ~Handle(0)
+
+// Errno value that means "no error".
+ERROR_NONE: Errno: 0
+
+// Descriptors 0, 1 and 2.
+stdin:  Handle = 0
+stdout: Handle = 1
+stderr: Handle = 2
+
+// Local aliases for the C types declared in core:sys/haiku.
+pid_t     :: haiku.pid_t
+off_t     :: haiku.off_t
+dev_t     :: haiku.dev_t
+ino_t     :: haiku.ino_t
+mode_t    :: haiku.mode_t
+nlink_t   :: haiku.nlink_t
+uid_t     :: haiku.uid_t
+gid_t     :: haiku.gid_t
+blksize_t :: haiku.blksize_t
+blkcnt_t  :: haiku.blkcnt_t
+time_t    :: haiku.time_t
+
+
+// Timestamp split into whole seconds and a nanosecond part; used for the
+// time fields of OS_Stat.
+// NOTE(review): presumably laid out like the C `struct timespec` - confirm against the Haiku headers.
+Unix_File_Time :: struct {
+	seconds:     time_t,
+	nanoseconds: c.long,
+}
+
+// Buffer type handed to the foreign stat/fstat/lstat procedures, which fill it in.
+// NOTE(review): field order and sizes must match the C `struct stat` of the
+// target system for those calls to be valid - verify when changing anything here.
+OS_Stat :: struct {
+	device_id: dev_t,		// device ID that this file resides on
+	serial: ino_t,			// this file's serial inode ID
+	mode: mode_t,			// file mode (rwx for user, group, etc)
+	nlink: nlink_t,			// number of hard links to this file
+	uid: uid_t,			// user ID of the file's owner
+	gid: gid_t,			// group ID of the file's group
+	size: off_t,			// file size, in bytes
+	rdev: dev_t,			// device type (not used)
+	block_size:	blksize_t,	// optimal blocksize for I/O
+	
+	last_access: Unix_File_Time,	// time of last access
+	modified: Unix_File_Time,	// time of last data modification
+	status_change: Unix_File_Time,	// time of last file status change
+	birthtime:	Unix_File_Time,	// time of file creation
+
+	type: u32,                      // attribute/index type
+
+	blocks: blkcnt_t,		// blocks allocated for file
+}
+
+/* file access modes for open() */
+O_RDONLY         :: 0x0000		/* read only */
+O_WRONLY         :: 0x0001		/* write only */
+O_RDWR           :: 0x0002		/* read and write */
+O_ACCMODE        :: 0x0003		/* mask to get the access modes above */
+O_RWMASK         :: O_ACCMODE
+
+/* flags for open() */
+O_EXCL           :: 0x0100		/* exclusive creat */
+O_CREATE         :: 0x0200		/* create and open file */
+O_TRUNC          :: 0x0400		/* open with truncation */
+O_NOCTTY         :: 0x1000		/* don't make tty the controlling tty */
+O_NOTRAVERSE     :: 0x2000		/* do not traverse leaf link */
+
+// File type
+// (values are octal; S_IFMT selects the type bits tested by the S_IS* procedures)
+S_IFMT   :: 0o170000 // Type of file mask
+S_IFIFO  :: 0o010000 // Named pipe (fifo)
+S_IFCHR  :: 0o020000 // Character special
+S_IFDIR  :: 0o040000 // Directory
+S_IFBLK  :: 0o060000 // Block special
+S_IFREG  :: 0o100000 // Regular
+S_IFLNK  :: 0o120000 // Symbolic link
+S_IFSOCK :: 0o140000 // Socket
+S_ISVTX  :: 0o001000 // Save swapped text even after use
+
+// File mode
+	// Read, write, execute/search by owner
+S_IRWXU :: 0o0700 // RWX mask for owner
+S_IRUSR :: 0o0400 // R for owner
+S_IWUSR :: 0o0200 // W for owner
+S_IXUSR :: 0o0100 // X for owner
+
+	// Read, write, execute/search by group
+S_IRWXG :: 0o0070 // RWX mask for group
+S_IRGRP :: 0o0040 // R for group
+S_IWGRP :: 0o0020 // W for group
+S_IXGRP :: 0o0010 // X for group
+
+	// Read, write, execute/search by others
+S_IRWXO :: 0o0007 // RWX mask for other
+S_IROTH :: 0o0004 // R for other
+S_IWOTH :: 0o0002 // W for other
+S_IXOTH :: 0o0001 // X for other
+
+S_ISUID :: 0o4000 // Set user id on execution
+S_ISGID :: 0o2000 // Set group id on execution
+S_ISTXT :: 0o1000 // Sticky bit (same value as S_ISVTX)
+
+// File-type predicates: each one isolates the type bits of a mode value with
+// S_IFMT and compares them against a single S_IF* constant.
+
+// True when the type bits equal S_IFLNK.
+S_ISLNK :: #force_inline proc(m: u32) -> bool {
+	return S_IFLNK == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFREG.
+S_ISREG :: #force_inline proc(m: u32) -> bool {
+	return S_IFREG == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFDIR.
+S_ISDIR :: #force_inline proc(m: u32) -> bool {
+	return S_IFDIR == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFCHR.
+S_ISCHR :: #force_inline proc(m: u32) -> bool {
+	return S_IFCHR == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFBLK.
+S_ISBLK :: #force_inline proc(m: u32) -> bool {
+	return S_IFBLK == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFIFO.
+S_ISFIFO :: #force_inline proc(m: u32) -> bool {
+	return S_IFIFO == (m & S_IFMT)
+}
+
+// True when the type bits equal S_IFSOCK.
+S_ISSOCK :: #force_inline proc(m: u32) -> bool {
+	return S_IFSOCK == (m & S_IFMT)
+}
+
+
+// Bindings to the C library ("system:c"). Each entry maps a `_unix_*` (or
+// similar) Odin name onto the C symbol given in `link_name`.
+foreign libc {
+	// Returns the address of the calling thread's errno; read by get_last_error().
+	@(link_name="_errnop")	__error		:: proc() -> ^c.int ---
+
+	@(link_name="fork")	_unix_fork	:: proc() -> pid_t ---
+	// NOTE(review): `getthrid` is not referenced anywhere in the visible part of
+	// this file - confirm the symbol actually exists in Haiku's C library.
+	@(link_name="getthrid")	_unix_getthrid	:: proc() -> int ---
+
+	// File and path operations.
+	@(link_name="open")	_unix_open	:: proc(path: cstring, flags: c.int, mode: c.int) -> Handle ---
+	@(link_name="close")	_unix_close	:: proc(fd: Handle) -> c.int ---
+	@(link_name="read")	_unix_read	:: proc(fd: Handle, buf: rawptr, size: c.size_t) -> c.ssize_t ---
+	@(link_name="write")	_unix_write	:: proc(fd: Handle, buf: rawptr, size: c.size_t) -> c.ssize_t ---
+	@(link_name="lseek")	_unix_seek	:: proc(fd: Handle, offset: off_t, whence: c.int) -> off_t ---
+	@(link_name="stat")	_unix_stat	:: proc(path: cstring, sb: ^OS_Stat) -> c.int ---
+	@(link_name="fstat")	_unix_fstat	:: proc(fd: Handle, sb: ^OS_Stat) -> c.int ---
+	@(link_name="lstat")	_unix_lstat	:: proc(path: cstring, sb: ^OS_Stat) -> c.int ---
+	@(link_name="readlink")	_unix_readlink	:: proc(path: cstring, buf: ^byte, bufsiz: c.size_t) -> c.ssize_t ---
+	@(link_name="access")	_unix_access	:: proc(path: cstring, mask: c.int) -> c.int ---
+	@(link_name="getcwd")	_unix_getcwd	:: proc(buf: cstring, len: c.size_t) -> cstring ---
+	@(link_name="chdir")	_unix_chdir	:: proc(path: cstring) -> c.int ---
+	@(link_name="rename")	_unix_rename	:: proc(old, new: cstring) -> c.int ---
+	@(link_name="unlink")	_unix_unlink	:: proc(path: cstring) -> c.int ---
+	@(link_name="rmdir")	_unix_rmdir	:: proc(path: cstring) -> c.int ---
+	@(link_name="mkdir")	_unix_mkdir	:: proc(path: cstring, mode: mode_t) -> c.int ---
+
+	// System configuration and directory streams.
+	@(link_name="getpagesize") _unix_getpagesize :: proc() -> c.int ---
+	@(link_name="sysconf") _sysconf :: proc(name: c.int) -> c.long ---
+	@(link_name="fdopendir") _unix_fdopendir :: proc(fd: Handle) -> Dir ---
+	@(link_name="closedir")	_unix_closedir	:: proc(dirp: Dir) -> c.int ---
+	@(link_name="rewinddir") _unix_rewinddir :: proc(dirp: Dir) ---
+	@(link_name="readdir_r") _unix_readdir_r :: proc(dirp: Dir, entry: ^Dirent, result: ^^Dirent) -> c.int ---
+
+	// C heap.
+	@(link_name="malloc")	_unix_malloc	:: proc(size: c.size_t) -> rawptr ---
+	@(link_name="calloc")	_unix_calloc	:: proc(num, size: c.size_t) -> rawptr ---
+	@(link_name="free")	_unix_free	:: proc(ptr: rawptr) ---
+	@(link_name="realloc")	_unix_realloc	:: proc(ptr: rawptr, size: c.size_t) -> rawptr ---
+
+	// Environment and path resolution.
+	@(link_name="getenv")	_unix_getenv	:: proc(cstring) -> cstring ---
+	@(link_name="realpath")	_unix_realpath	:: proc(path: cstring, resolved_path: rawptr) -> rawptr ---
+
+	// Declared as diverging (`-> !`): never returns to the caller.
+	@(link_name="exit")	_unix_exit	:: proc(status: c.int) -> ! ---
+
+	// Dynamic loading.
+	@(link_name="dlopen")	_unix_dlopen	:: proc(filename: cstring, flags: c.int) -> rawptr ---
+	@(link_name="dlsym")	_unix_dlsym	:: proc(handle: rawptr, symbol: cstring) -> rawptr ---
+	@(link_name="dlclose")	_unix_dlclose	:: proc(handle: rawptr) -> c.int ---
+	@(link_name="dlerror")	_unix_dlerror	:: proc() -> cstring ---
+}
+
+MAXNAMLEN :: haiku.NAME_MAX
+
+// Directory entry record filled in by `_unix_readdir_r`.
+// NOTE(review): the layout is assumed to match the C `struct dirent` of the
+// target system - confirm against the Haiku headers before changing fields.
+Dirent :: struct {
+	dev:      dev_t,
+	pdef:     dev_t, // NOTE(review): presumably the parent device (`d_pdev` in C) - confirm
+	ino:      ino_t,
+	pino:     ino_t,
+	reclen:   u16,
+	name:     [MAXNAMLEN + 1]byte, // name
+}
+
+Dir :: distinct rawptr // DIR*
+
+// Reports whether `r` is the path separator; only the forward slash qualifies.
+is_path_separator :: proc(r: rune) -> bool {
+	switch r {
+	case '/':
+		return true
+	}
+	return false
+}
+
+// Returns the current errno value, read through the pointer that the
+// C library's `_errnop` hands back.
+get_last_error :: proc "contextless" () -> int {
+	errno_ptr := __error()
+	return int(errno_ptr^)
+}
+
+// Calls the C `fork`. Returns its result with ERROR_NONE, or -1 together with
+// the current errno when the call reports -1.
+fork :: proc() -> (Pid, Errno) {
+	child := _unix_fork()
+	if child != -1 {
+		return Pid(child), ERROR_NONE
+	}
+	return Pid(-1), Errno(get_last_error())
+}
+
+// Opens `path` with the given `flags` and `mode`.
+// The path is copied to a NUL-terminated string in the temp allocator for the call.
+// Returns the new handle and ERROR_NONE, or INVALID_HANDLE and the current errno on failure.
+open :: proc(path: string, flags: int = O_RDONLY, mode: int = 0) -> (Handle, Errno) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+	c_path := strings.clone_to_cstring(path, context.temp_allocator)
+
+	fd := _unix_open(c_path, c.int(flags), c.int(mode))
+	if fd != -1 {
+		return fd, ERROR_NONE
+	}
+	return INVALID_HANDLE, Errno(get_last_error())
+}
+
+// Closes `fd`. Returns ERROR_NONE, or the current errno when the C call reports -1.
+close :: proc(fd: Handle) -> Errno {
+	if _unix_close(fd) == -1 {
+		return Errno(get_last_error())
+	}
+	return ERROR_NONE
+}
+
+// In practice a read/write call would probably never read/write these big buffers all at once,
+// which is why the number of bytes is returned and why there are procs that will call this in a
+// loop for you.
+// We set a max of 1GB to keep alignment and to be safe.
+@(private)
+MAX_RW :: 1 << 30
+
+// Reads at most min(len(data), MAX_RW) bytes from `fd` into `data` with a
+// single call to the C `read`.
+//
+// Returns the number of bytes read and ERROR_NONE, or -1 and the current
+// errno when the call fails. An empty `data` returns (0, ERROR_NONE) without
+// touching the descriptor.
+read :: proc(fd: Handle, data: []byte) -> (int, Errno) {
+	// `&data[0]` below is bounds-checked, so an empty slice must be handled
+	// up front; this mirrors the guard in write().
+	if len(data) == 0 {
+		return 0, ERROR_NONE
+	}
+
+	to_read    := min(c.size_t(len(data)), MAX_RW)
+	bytes_read := _unix_read(fd, &data[0], to_read)
+	if bytes_read == -1 {
+		return -1, Errno(get_last_error())
+	}
+	return int(bytes_read), ERROR_NONE
+}
+
+// Writes at most min(len(data), MAX_RW) bytes of `data` to `fd` with a single
+// call to the C `write`.
+// Returns the number of bytes written and ERROR_NONE, or -1 and the current
+// errno on failure. An empty `data` returns (0, ERROR_NONE) immediately.
+write :: proc(fd: Handle, data: []byte) -> (int, Errno) {
+	total := len(data)
+	if total == 0 {
+		return 0, ERROR_NONE
+	}
+
+	chunk := min(c.size_t(total), MAX_RW)
+	n := _unix_write(fd, &data[0], chunk)
+	if n != -1 {
+		return int(n), ERROR_NONE
+	}
+	return -1, Errno(get_last_error())
+}
+
+// Repositions `fd` through the C `lseek`.
+// Returns the value reported by the call and ERROR_NONE, or -1 and the
+// current errno when it reports -1.
+seek :: proc(fd: Handle, offset: i64, whence: int) -> (i64, Errno) {
+	pos := _unix_seek(fd, offset, c.int(whence))
+	if pos != -1 {
+		return pos, ERROR_NONE
+	}
+	return -1, Errno(get_last_error())
+}
+
+// Returns the `size` field of the `_fstat` result for `fd`, or -1 and the
+// error when `_fstat` fails.
+file_size :: proc(fd: Handle) -> (i64, Errno) {
+	info, stat_err := _fstat(fd)
+	if stat_err == ERROR_NONE {
+		return info.size, ERROR_NONE
+	}
+	return -1, stat_err
+}
+
+// "Argv" arguments converted to Odin strings
+args := _alloc_command_line_arguments()
+
+// Builds a newly allocated slice holding one Odin string per entry of
+// `runtime.args__`, in the same order.
+_alloc_command_line_arguments :: proc() -> []string {
+	count := len(runtime.args__)
+	out := make([]string, count)
+	for i in 0..<count {
+		out[i] = string(runtime.args__[i])
+	}
+	return out
+}
+
+// Runs the C `stat` on `path` (copied to a temp-allocated cstring).
+// Returns the filled record and ERROR_NONE, or the record as-is and the
+// current errno when the call reports -1.
+@private
+_stat :: proc(path: string) -> (OS_Stat, Errno) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+	c_path := strings.clone_to_cstring(path, context.temp_allocator)
+
+	// left uninitialized on purpose
+	info: OS_Stat = ---
+	if _unix_stat(c_path, &info) == -1 {
+		return info, Errno(get_last_error())
+	}
+	return info, ERROR_NONE
+}
+
+// Runs the C `lstat` on `path` (copied to a temp-allocated cstring).
+// Returns the filled record and ERROR_NONE, or the record as-is and the
+// current errno when the call reports -1.
+@private
+_lstat :: proc(path: string) -> (OS_Stat, Errno) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+	c_path := strings.clone_to_cstring(path, context.temp_allocator)
+
+	// left uninitialized on purpose
+	info: OS_Stat = ---
+	if _unix_lstat(c_path, &info) == -1 {
+		return info, Errno(get_last_error())
+	}
+	return info, ERROR_NONE
+}
+
+// Runs the C `fstat` on `fd`.
+// Returns the filled record and ERROR_NONE, or the record as-is and the
+// current errno when the call reports -1.
+@private
+_fstat :: proc(fd: Handle) -> (OS_Stat, Errno) {
+	// left uninitialized on purpose
+	info: OS_Stat = ---
+	if _unix_fstat(fd, &info) == -1 {
+		return info, Errno(get_last_error())
+	}
+	return info, ERROR_NONE
+}
+
+// Wraps the C `fdopendir`. Returns the directory stream and ERROR_NONE, or
+// nil and the current errno when the call returns a nil stream.
+@private
+_fdopendir :: proc(fd: Handle) -> (Dir, Errno) {
+	stream := _unix_fdopendir(fd)
+	if stream != cast(Dir)nil {
+		return stream, ERROR_NONE
+	}
+	return nil, Errno(get_last_error())
+}
+
+// Wraps the C `closedir`. Returns ERROR_NONE when the call reports 0,
+// otherwise the current errno.
+@private
+_closedir :: proc(dirp: Dir) -> Errno {
+	if _unix_closedir(dirp) == 0 {
+		return ERROR_NONE
+	}
+	return Errno(get_last_error())
+}
+
+// Forwards to the C `rewinddir` for `dirp`; the C call has no result to report.
+@private
+_rewinddir :: proc(dirp: Dir) {
+	_unix_rewinddir(dirp)
+}
+
+// Reads the next entry of the directory stream `dirp` via the C `readdir_r`.
+//
+// Returns:
+//   entry         - the record filled in by the call
+//   err           - ERROR_NONE, or the error code returned by `readdir_r`
+//   end_of_stream - true when the call succeeded but produced no entry
+@private
+_readdir :: proc(dirp: Dir) -> (entry: Dirent, err: Errno, end_of_stream: bool) {
+	result: ^Dirent
+	rc := _unix_readdir_r(dirp, &entry, &result)
+
+	if rc != 0 {
+		// readdir_r reports failure through its return value and does not set
+		// errno, so reading errno here could yield a stale value (or 0, which
+		// would hide the failure from the caller).
+		err = Errno(rc)
+		return
+	}
+	err = ERROR_NONE
+
+	// A nil result after a successful call marks the end of the directory.
+	end_of_stream = result == nil
+	return
+}
+
+// Reads the target of the symbolic link `path` via the C `readlink`.
+// The buffer starts at MAX_PATH bytes and grows by MAX_PATH each time the
+// call fills it completely. On success the returned string refers to that
+// buffer, which was made with the default `make` allocator; on failure the
+// buffer is freed and the current errno is returned.
+@private
+_readlink :: proc(path: string) -> (string, Errno) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD(ignore = context.temp_allocator == context.allocator)
+	c_path := strings.clone_to_cstring(path, context.temp_allocator)
+
+	capacity : uint = MAX_PATH
+	link_buf := make([]byte, MAX_PATH)
+	for {
+		n := _unix_readlink(c_path, &(link_buf[0]), capacity)
+		switch {
+		case n == -1:
+			delete(link_buf)
+			return "", Errno(get_last_error())
+		case n == int(capacity):
+			// buffer completely filled: the target may have been cut off, retry with more room
+			capacity += MAX_PATH
+			delete(link_buf)
+			link_buf = make([]byte, capacity)
+		case:
+			return strings.string_from_ptr(&link_buf[0], n), ERROR_NONE
+		}
+	}
+}
+
+// Not implemented in this file: always returns an empty string and ENOSYS,
+// regardless of `fd`.
+absolute_path_from_handle :: proc(fd: Handle) -> (string, Errno) {
+	return "", Errno(ENOSYS)
+}
+
+// Resolves `rel` with the C `realpath` and returns a copy of the result made
+// with `strings.clone`. An empty `rel` is treated as ".".
+// Returns "" and the current errno when `realpath` returns nil.
+absolute_path_from_relative :: proc(rel: string) -> (path: string, err: Errno) {
+	target := rel if rel != "" else "."
+
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD(ignore = context.temp_allocator == context.allocator)
+	c_target := strings.clone_to_cstring(target, context.temp_allocator)
+
+	resolved := _unix_realpath(c_target, nil)
+	if resolved == nil {
+		return "", Errno(get_last_error())
+	}
+	// the buffer returned by realpath is released with the C free once copied
+	defer _unix_free(resolved)
+
+	path = strings.clone(string(transmute(cstring)resolved))
+	return path, ERROR_NONE
+}
+
+// Calls the C `access` on `path` with `mask`.
+// Returns (true, ERROR_NONE), or (false, current errno) when the call reports -1.
+access :: proc(path: string, mask: int) -> (bool, Errno) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
+	c_path := strings.clone_to_cstring(path, context.temp_allocator)
+
+	if _unix_access(c_path, c.int(mask)) == -1 {
+		return false, Errno(get_last_error())
+	}
+	return true, ERROR_NONE
+}
+
+// Looks up the environment variable `key` with the C `getenv`.
+// When it exists, returns a copy of its value made with `allocator` and
+// `found = true`; otherwise returns "" and false.
+lookup_env :: proc(key: string, allocator := context.allocator) -> (value: string, found: bool) {
+	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD(ignore = context.temp_allocator == allocator)
+	c_key := strings.clone_to_cstring(key, context.temp_allocator)
+
+	raw := _unix_getenv(c_key)
+	if raw != nil {
+		return strings.clone(string(raw), allocator), true
+	}
+	return "", false
+}
+
+// Same as lookup_env, but drops the `found` flag: a missing variable yields "".
+get_env :: proc(key: string, allocator := context.allocator) -> (value: string) {
+	looked_up, _ := lookup_env(key, allocator)
+	return looked_up
+}
+
+// Returns the `cpu_count` field reported by `haiku.get_system_info`.
+@(private)
+_processor_core_count :: proc() -> int {
+	sys: haiku.system_info
+	haiku.get_system_info(&sys)
+	cores := int(sys.cpu_count)
+	return cores
+}
+
+// Runs the runtime's contextless cleanup, then terminates the process through
+// the C `exit` with `code` narrowed to a C int. Never returns.
+exit :: proc "contextless" (code: int) -> ! {
+	runtime._cleanup_runtime_contextless()
+	status := c.int(code)
+	_unix_exit(status)
+}
diff --git a/core/os/stat_unix.odin b/core/os/stat_unix.odin
index dae7ab2fb..5e83c0e16 100644
--- a/core/os/stat_unix.odin
+++ b/core/os/stat_unix.odin
@@ -1,4 +1,4 @@
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 package os
 
 import "core:time"
diff --git a/core/os/stream.odin b/core/os/stream.odin
index d7ce11d26..25f31218c 100644
--- a/core/os/stream.odin
+++ b/core/os/stream.odin
@@ -32,7 +32,7 @@ _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte,
 		}
 
 	case .Read_At:
-		when !(ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD) {
+		when !(ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD || ODIN_OS == .Haiku) {
 			n_int, os_err = read_at(fd, p, offset)
 			n = i64(n_int)
 			if n == 0 && os_err == 0 {
@@ -46,7 +46,7 @@ _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte,
 			err = .EOF
 		}
 	case .Write_At:
-		when !(ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD) {
+		when !(ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD || ODIN_OS == .Haiku) {
 			n_int, os_err = write_at(fd, p, offset)
 			n = i64(n_int)
 			if n == 0 && os_err == 0 {
@@ -60,7 +60,7 @@ _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte,
 	case .Destroy:
 		err = .Empty
 	case .Query:
-		when ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD {
+		when ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD || ODIN_OS == .Haiku {
 			return io.query_utility({.Close, .Flush, .Read, .Write, .Seek, .Size, .Query})
 		} else {
 			return io.query_utility({.Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Query})
diff --git a/core/sync/futex_haiku.odin b/core/sync/futex_haiku.odin
new file mode 100644
index 000000000..1dd719e7a
--- /dev/null
+++ b/core/sync/futex_haiku.odin
@@ -0,0 +1,167 @@
+//+private
+package sync
+
+import "core:c"
+import "core:runtime"
+import "core:sys/haiku"
+import "core:sys/unix"
+import "core:time"
+
+// One waiting thread: which thread it is, which futex it waits on, and its
+// links in the doubly linked list owned by a Wait_Queue.
+@(private="file")
+Wait_Node :: struct {
+	thread:     unix.pthread_t,
+	futex:      ^Futex,
+	prev, next: ^Wait_Node,
+}
+// Boolean used as a spin-lock flag; accessed only through atomic operations
+// in waitq_lock / waitq_unlock.
+@(private="file")
+atomic_flag :: distinct bool
+// A lock plus a circular list of waiters; `list` is the sentinel head node.
+@(private="file")
+Wait_Queue :: struct {
+	lock: atomic_flag,
+	list: Wait_Node,
+}
+// Acquires the queue's spin lock: atomically sets the flag to true and keeps
+// retrying (with cpu_relax between attempts) while the previous value was
+// already true.
+@(private="file")
+waitq_lock :: proc "contextless" (waitq: ^Wait_Queue) {
+	for cast(bool)atomic_exchange_explicit(&waitq.lock, atomic_flag(true), .Acquire) {
+		cpu_relax() // spin...
+	}
+}
+// Releases the queue's spin lock by atomically storing false into the flag.
+@(private="file")
+waitq_unlock :: proc "contextless" (waitq: ^Wait_Queue) {
+	atomic_store_explicit(&waitq.lock, atomic_flag(false), .Release)
+}
+
+// FIXME: This approach may scale badly in the future,
+// possible solution - hash map (leads to deadlocks now).
+// Single wait queue shared by every futex (see get_waitq).
+@(private="file")
+g_waitq: Wait_Queue
+
+// Runs at program initialisation (`@(init)`): makes the sentinel node point
+// at itself in both directions, which is the empty state of the circular list.
+@(init, private="file")
+g_waitq_init :: proc() {
+	g_waitq = {
+		list = {
+			prev = &g_waitq.list,
+			next = &g_waitq.list,
+		},
+	}
+}
+
+// Returns the wait queue for futex `f`. Every futex currently maps to the one
+// global queue, so `f` is deliberately ignored.
+@(private="file")
+get_waitq :: #force_inline proc "contextless" (f: ^Futex) -> ^Wait_Queue {
+	_ = f
+	queue := &g_waitq
+	return queue
+}
+
+// Blocks the calling thread while the value of `f` equals `expect`, until it
+// receives SIGCONT (sent by _futex_signal / _futex_broadcast).
+//
+// `ok` is set only inside the branch that actually waits; when the value
+// already differs from `expect` the procedure returns with `ok` left at its
+// zero value (false).
+_futex_wait :: proc "contextless" (f: ^Futex, expect: u32) -> (ok: bool) {
+	waitq := get_waitq(f)
+	waitq_lock(waitq)
+	defer waitq_unlock(waitq)
+
+	// Register this thread right after the sentinel. The node lives on this
+	// stack frame, so it must be unlinked before returning (done below).
+	head   := &waitq.list
+	waiter := Wait_Node{
+		thread = unix.pthread_self(),
+		futex  = f,
+		prev   = head,
+		next   = head.next,
+	}
+
+	waiter.prev.next = &waiter
+	waiter.next.prev = &waiter
+
+	// Block SIGCONT for this thread so it can be collected with sigwait below;
+	// the previous mask is saved for restoration at the end.
+	old_mask, mask: haiku.sigset_t
+	haiku.sigemptyset(&mask)
+	haiku.sigaddset(&mask, haiku.SIGCONT)
+	unix.pthread_sigmask(haiku.SIG_BLOCK, &mask, &old_mask)
+
+	if u32(atomic_load_explicit(f, .Acquire)) == expect {
+		// Drop the queue lock while sleeping; the defer re-takes it when this
+		// scope ends so the unlink below runs under the lock.
+		waitq_unlock(waitq)
+		defer waitq_lock(waitq)
+		
+		sig: c.int
+		haiku.sigwait(&mask, &sig)
+		// NOTE(review): POSIX sigwait reports errors through its return value,
+		// not errno - confirm that reading errno here reflects this call.
+		errno := haiku.errno() 
+		ok = errno == .OK
+	}
+
+	// Unlink this thread's node from the list.
+	waiter.prev.next = waiter.next
+	waiter.next.prev = waiter.prev
+
+ 	unix.pthread_sigmask(haiku.SIG_SETMASK, &old_mask, nil)
+
+ 	// FIXME: Add error handling!
+ 	return
+}
+
+// Like _futex_wait, but waits for SIGCONT with `sigtimedwait` and a timeout
+// built from `duration`. Returns false immediately when `duration` is not
+// positive.
+//
+// `ok` is set only inside the branch that actually waits; when the value of
+// `f` already differs from `expect` it is left at its zero value (false).
+_futex_wait_with_timeout :: proc "contextless" (f: ^Futex, expect: u32, duration: time.Duration) -> (ok: bool) {
+	if duration <= 0 {
+		return false
+	}
+	waitq := get_waitq(f)
+	waitq_lock(waitq)
+	defer waitq_unlock(waitq)
+
+	// Register this thread right after the sentinel. The node lives on this
+	// stack frame, so it must be unlinked before returning (done below).
+	head   := &waitq.list
+	waiter := Wait_Node{
+		thread = unix.pthread_self(),
+		futex  = f,
+		prev   = head,
+		next   = head.next,
+	}
+
+	waiter.prev.next = &waiter
+	waiter.next.prev = &waiter
+
+	// Block SIGCONT for this thread so it can be collected with sigtimedwait
+	// below; the previous mask is saved for restoration at the end.
+	old_mask, mask: haiku.sigset_t
+	haiku.sigemptyset(&mask)
+	haiku.sigaddset(&mask, haiku.SIGCONT)
+	unix.pthread_sigmask(haiku.SIG_BLOCK, &mask, &old_mask)
+
+	if u32(atomic_load_explicit(f, .Acquire)) == expect {
+		// Drop the queue lock while sleeping; the defer re-takes it when this
+		// scope ends so the unlink below runs under the lock.
+		waitq_unlock(waitq)
+		defer waitq_lock(waitq)
+		
+		info: haiku.siginfo_t
+		// Split the duration into whole seconds and the nanosecond remainder.
+		ts := unix.timespec{
+			tv_sec  = i64(duration / 1e9),
+			tv_nsec = i64(duration % 1e9),
+		}
+		haiku.sigtimedwait(&mask, &info, &ts)
+		errno := haiku.errno() 
+		// NOTE(review): EAGAIN is accepted as success here; POSIX sigtimedwait
+		// uses EAGAIN to report that the timeout expired - confirm that a
+		// timed-out wait is really meant to return true.
+		ok = errno == .EAGAIN || errno == .OK
+	}
+
+	// Unlink this thread's node from the list.
+	waiter.prev.next = waiter.next
+	waiter.next.prev = waiter.prev
+
+ 	unix.pthread_sigmask(haiku.SIG_SETMASK, &old_mask, nil)
+
+ 	// FIXME: Add error handling!
+ 	return 
+}
+
+// Wakes at most one waiter: walks the queue under its lock and sends SIGCONT
+// to the first node registered for futex `f`, then stops.
+_futex_signal :: proc "contextless" (f: ^Futex) {
+	queue := get_waitq(f)
+	waitq_lock(queue)
+	defer waitq_unlock(queue)
+
+	sentinel := &queue.list
+	node := sentinel.next
+	for node != sentinel {
+		if node.futex == f {
+			unix.pthread_kill(node.thread, haiku.SIGCONT)
+			break
+		}
+		node = node.next
+	}
+}
+
+// Wakes every waiter: walks the whole queue under its lock and sends SIGCONT
+// to each node registered for futex `f`.
+_futex_broadcast :: proc "contextless" (f: ^Futex) {
+	queue := get_waitq(f)
+	waitq_lock(queue)
+	defer waitq_unlock(queue)
+
+	sentinel := &queue.list
+	for node := sentinel.next; node != sentinel; node = node.next {
+		if node.futex != f {
+			continue
+		}
+		unix.pthread_kill(node.thread, haiku.SIGCONT)
+	}
+}
diff --git a/core/sync/primitives_haiku.odin b/core/sync/primitives_haiku.odin
new file mode 100644
index 000000000..4b8f6b02d
--- /dev/null
+++ b/core/sync/primitives_haiku.odin
@@ -0,0 +1,8 @@
+//+private
+package sync
+
+import "core:sys/haiku"
+
+// Returns the result of `haiku.find_thread(nil)` converted to int.
+_current_thread_id :: proc "contextless" () -> int {
+	self := haiku.find_thread(nil)
+	return int(self)
+}
diff --git a/core/sys/haiku/errors.odin b/core/sys/haiku/errors.odin
new file mode 100644
index 000000000..023045001
--- /dev/null
+++ b/core/sys/haiku/errors.odin
@@ -0,0 +1,239 @@
+//+build haiku
+package sys_haiku
+
+import "core:c"
+
+// Haiku error codes as a C-int-backed enum.
+//
+// Every category starts at a `*_ERROR_BASE`, itself an offset from
+// GENERAL_ERROR_BASE (the minimum c.int value), so these codes are negative.
+// OK / NO_ERROR are 0 and ERROR is -1. Several names intentionally share a
+// value (e.g. NO_INIT / NOT_INITIALIZED, EAGAIN / WOULD_BLOCK, and the
+// `*_NEG` aliases of the POSIX codes).
+Errno :: enum c.int {
+	// Error baselines
+	GENERAL_ERROR_BASE     = min(c.int),
+	OS_ERROR_BASE          = GENERAL_ERROR_BASE + 0x1000,
+	APP_ERROR_BASE         = GENERAL_ERROR_BASE + 0x2000,
+	INTERFACE_ERROR_BASE   = GENERAL_ERROR_BASE + 0x3000,
+	MEDIA_ERROR_BASE       = GENERAL_ERROR_BASE + 0x4000,
+	TRANSLATION_ERROR_BASE = GENERAL_ERROR_BASE + 0x4800,
+	MIDI_ERROR_BASE        = GENERAL_ERROR_BASE + 0x5000,
+	STORAGE_ERROR_BASE     = GENERAL_ERROR_BASE + 0x6000,
+	POSIX_ERROR_BASE       = GENERAL_ERROR_BASE + 0x7000,
+	MAIL_ERROR_BASE        = GENERAL_ERROR_BASE + 0x8000,
+	PRINT_ERROR_BASE       = GENERAL_ERROR_BASE + 0x9000,
+	DEVICE_ERROR_BASE      = GENERAL_ERROR_BASE + 0xa000,
+
+	// Developer-defined errors start at (ERRORS_END+1)
+	ERRORS_END             = GENERAL_ERROR_BASE + 0xffff,
+
+	// General Errors
+	NO_MEMORY              = GENERAL_ERROR_BASE + 0,
+	IO_ERROR               = GENERAL_ERROR_BASE + 1,
+	PERMISSION_DENIED      = GENERAL_ERROR_BASE + 2,
+	BAD_INDEX              = GENERAL_ERROR_BASE + 3,
+	BAD_TYPE               = GENERAL_ERROR_BASE + 4,
+	BAD_VALUE              = GENERAL_ERROR_BASE + 5,
+	MISMATCHED_VALUES      = GENERAL_ERROR_BASE + 6,
+	NAME_NOT_FOUND         = GENERAL_ERROR_BASE + 7,
+	NAME_IN_USE            = GENERAL_ERROR_BASE + 8,
+	TIMED_OUT              = GENERAL_ERROR_BASE + 9,
+	INTERRUPTED            = GENERAL_ERROR_BASE + 10,
+	WOULD_BLOCK            = GENERAL_ERROR_BASE + 11,
+	CANCELED               = GENERAL_ERROR_BASE + 12,
+	NO_INIT                = GENERAL_ERROR_BASE + 13,
+	NOT_INITIALIZED        = GENERAL_ERROR_BASE + 13,
+	BUSY                   = GENERAL_ERROR_BASE + 14,
+	NOT_ALLOWED            = GENERAL_ERROR_BASE + 15,
+	BAD_DATA               = GENERAL_ERROR_BASE + 16,
+	DONT_DO_THAT           = GENERAL_ERROR_BASE + 17,
+
+	ERROR                  = -1,
+	OK                     = 0,
+	NO_ERROR               = 0,
+
+	// Kernel Kit Errors
+	BAD_SEM_ID                        = OS_ERROR_BASE + 0,
+	NO_MORE_SEMS                      = OS_ERROR_BASE + 1,
+	BAD_THREAD_ID                     = OS_ERROR_BASE + 0x100,
+	NO_MORE_THREADS                   = OS_ERROR_BASE + 0x101,
+	BAD_THREAD_STATE                  = OS_ERROR_BASE + 0x102,
+	BAD_TEAM_ID                       = OS_ERROR_BASE + 0x103,
+	NO_MORE_TEAMS                     = OS_ERROR_BASE + 0x104,
+	BAD_PORT_ID                       = OS_ERROR_BASE + 0x200,
+	NO_MORE_PORTS                     = OS_ERROR_BASE + 0x201,
+	BAD_IMAGE_ID                      = OS_ERROR_BASE + 0x300,
+	BAD_ADDRESS                       = OS_ERROR_BASE + 0x301,
+	NOT_AN_EXECUTABLE                 = OS_ERROR_BASE + 0x302,
+	MISSING_LIBRARY                   = OS_ERROR_BASE + 0x303,
+	MISSING_SYMBOL                    = OS_ERROR_BASE + 0x304,
+	UNKNOWN_EXECUTABLE                = OS_ERROR_BASE + 0x305,
+	LEGACY_EXECUTABLE                 = OS_ERROR_BASE + 0x306,
+
+	DEBUGGER_ALREADY_INSTALLED        = OS_ERROR_BASE + 0x400,
+
+	// Application Kit Errors
+	BAD_REPLY                         = APP_ERROR_BASE + 0,
+	DUPLICATE_REPLY                   = APP_ERROR_BASE + 1,
+	MESSAGE_TO_SELF                   = APP_ERROR_BASE + 2,
+	BAD_HANDLER                       = APP_ERROR_BASE + 3,
+	ALREADY_RUNNING                   = APP_ERROR_BASE + 4,
+	LAUNCH_FAILED                     = APP_ERROR_BASE + 5,
+	AMBIGUOUS_APP_LAUNCH              = APP_ERROR_BASE + 6,
+	UNKNOWN_MIME_TYPE                 = APP_ERROR_BASE + 7,
+	BAD_SCRIPT_SYNTAX                 = APP_ERROR_BASE + 8,
+	LAUNCH_FAILED_NO_RESOLVE_LINK     = APP_ERROR_BASE + 9,
+	LAUNCH_FAILED_EXECUTABLE          = APP_ERROR_BASE + 10,
+	LAUNCH_FAILED_APP_NOT_FOUND       = APP_ERROR_BASE + 11,
+	LAUNCH_FAILED_APP_IN_TRASH        = APP_ERROR_BASE + 12,
+	LAUNCH_FAILED_NO_PREFERRED_APP    = APP_ERROR_BASE + 13,
+	LAUNCH_FAILED_FILES_APP_NOT_FOUND = APP_ERROR_BASE + 14,
+	BAD_MIME_SNIFFER_RULE             = APP_ERROR_BASE + 15,
+	NOT_A_MESSAGE                     = APP_ERROR_BASE + 16,
+	SHUTDOWN_CANCELLED                = APP_ERROR_BASE + 17,
+	SHUTTING_DOWN                     = APP_ERROR_BASE + 18,
+
+	// Storage Kit/File System Errors
+	FILE_ERROR                        = STORAGE_ERROR_BASE + 0,
+	// 1 was B_FILE_NOT_FOUND (deprecated)
+	FILE_EXISTS                       = STORAGE_ERROR_BASE + 2,
+	ENTRY_NOT_FOUND                   = STORAGE_ERROR_BASE + 3,
+	NAME_TOO_LONG                     = STORAGE_ERROR_BASE + 4,
+	NOT_A_DIRECTORY                   = STORAGE_ERROR_BASE + 5,
+	DIRECTORY_NOT_EMPTY               = STORAGE_ERROR_BASE + 6,
+	DEVICE_FULL                       = STORAGE_ERROR_BASE + 7,
+	READ_ONLY_DEVICE                  = STORAGE_ERROR_BASE + 8,
+	IS_A_DIRECTORY                    = STORAGE_ERROR_BASE + 9,
+	NO_MORE_FDS                       = STORAGE_ERROR_BASE + 10,
+	CROSS_DEVICE_LINK                 = STORAGE_ERROR_BASE + 11,
+	LINK_LIMIT                        = STORAGE_ERROR_BASE + 12,
+	BUSTED_PIPE                       = STORAGE_ERROR_BASE + 13,
+	UNSUPPORTED                       = STORAGE_ERROR_BASE + 14,
+	PARTITION_TOO_SMALL               = STORAGE_ERROR_BASE + 15,
+	PARTIAL_READ                      = STORAGE_ERROR_BASE + 16,
+	PARTIAL_WRITE                     = STORAGE_ERROR_BASE + 17,
+
+	// Some POSIX errors
+	E2BIG                             = POSIX_ERROR_BASE + 1,
+	EFBIG                             = POSIX_ERROR_BASE + 4,
+	ENODEV                            = POSIX_ERROR_BASE + 7,
+	ERANGE                            = POSIX_ERROR_BASE + 17,
+	EOVERFLOW                         = POSIX_ERROR_BASE + 41,
+	EOPNOTSUPP                        = POSIX_ERROR_BASE + 43,
+
+	ENOSYS                            = POSIX_ERROR_BASE + 9,
+	EAGAIN                            = WOULD_BLOCK,
+
+	// New error codes that can be mapped to POSIX errors
+	TOO_MANY_ARGS_NEG                 = E2BIG,
+	FILE_TOO_LARGE_NEG                = EFBIG,
+	DEVICE_NOT_FOUND_NEG              = ENODEV,
+	RESULT_NOT_REPRESENTABLE_NEG      = ERANGE,
+	BUFFER_OVERFLOW_NEG               = EOVERFLOW,
+	NOT_SUPPORTED_NEG                 = EOPNOTSUPP,
+
+	// The *_POS variants are the arithmetic negation of the codes above.
+	TOO_MANY_ARGS_POS                 = -E2BIG,
+	FILE_TOO_LARGE_POS                = -EFBIG,
+	DEVICE_NOT_FOUND_POS              = -ENODEV,
+	RESULT_NOT_REPRESENTABLE_POS      = -ERANGE,
+	BUFFER_OVERFLOW_POS               = -EOVERFLOW,
+	NOT_SUPPORTED_POS                 = -EOPNOTSUPP,
+
+	// Media Kit Errors
+	STREAM_NOT_FOUND             = MEDIA_ERROR_BASE + 0,
+	SERVER_NOT_FOUND             = MEDIA_ERROR_BASE + 1,
+	RESOURCE_NOT_FOUND           = MEDIA_ERROR_BASE + 2,
+	RESOURCE_UNAVAILABLE         = MEDIA_ERROR_BASE + 3,
+	BAD_SUBSCRIBER               = MEDIA_ERROR_BASE + 4,
+	SUBSCRIBER_NOT_ENTERED       = MEDIA_ERROR_BASE + 5,
+	BUFFER_NOT_AVAILABLE         = MEDIA_ERROR_BASE + 6,
+	LAST_BUFFER_ERROR            = MEDIA_ERROR_BASE + 7,
+	MEDIA_SYSTEM_FAILURE         = MEDIA_ERROR_BASE + 100,
+	MEDIA_BAD_NODE               = MEDIA_ERROR_BASE + 101,
+	MEDIA_NODE_BUSY              = MEDIA_ERROR_BASE + 102,
+	MEDIA_BAD_FORMAT             = MEDIA_ERROR_BASE + 103,
+	MEDIA_BAD_BUFFER             = MEDIA_ERROR_BASE + 104,
+	MEDIA_TOO_MANY_NODES         = MEDIA_ERROR_BASE + 105,
+	MEDIA_TOO_MANY_BUFFERS       = MEDIA_ERROR_BASE + 106,
+	MEDIA_NODE_ALREADY_EXISTS    = MEDIA_ERROR_BASE + 107,
+	MEDIA_BUFFER_ALREADY_EXISTS  = MEDIA_ERROR_BASE + 108,
+	MEDIA_CANNOT_SEEK            = MEDIA_ERROR_BASE + 109,
+	MEDIA_CANNOT_CHANGE_RUN_MODE = MEDIA_ERROR_BASE + 110,
+	MEDIA_APP_ALREADY_REGISTERED = MEDIA_ERROR_BASE + 111,
+	MEDIA_APP_NOT_REGISTERED     = MEDIA_ERROR_BASE + 112,
+	MEDIA_CANNOT_RECLAIM_BUFFERS = MEDIA_ERROR_BASE + 113,
+	MEDIA_BUFFERS_NOT_RECLAIMED  = MEDIA_ERROR_BASE + 114,
+	MEDIA_TIME_SOURCE_STOPPED    = MEDIA_ERROR_BASE + 115,
+	MEDIA_TIME_SOURCE_BUSY       = MEDIA_ERROR_BASE + 116,
+	MEDIA_BAD_SOURCE             = MEDIA_ERROR_BASE + 117,
+	MEDIA_BAD_DESTINATION        = MEDIA_ERROR_BASE + 118,
+	MEDIA_ALREADY_CONNECTED      = MEDIA_ERROR_BASE + 119,
+	MEDIA_NOT_CONNECTED          = MEDIA_ERROR_BASE + 120,
+	MEDIA_BAD_CLIP_FORMAT        = MEDIA_ERROR_BASE + 121,
+	MEDIA_ADDON_FAILED           = MEDIA_ERROR_BASE + 122,
+	MEDIA_ADDON_DISABLED         = MEDIA_ERROR_BASE + 123,
+	MEDIA_CHANGE_IN_PROGRESS     = MEDIA_ERROR_BASE + 124,
+	MEDIA_STALE_CHANGE_COUNT     = MEDIA_ERROR_BASE + 125,
+	MEDIA_ADDON_RESTRICTED       = MEDIA_ERROR_BASE + 126,
+	MEDIA_NO_HANDLER             = MEDIA_ERROR_BASE + 127,
+	MEDIA_DUPLICATE_FORMAT       = MEDIA_ERROR_BASE + 128,
+	MEDIA_REALTIME_DISABLED      = MEDIA_ERROR_BASE + 129,
+	MEDIA_REALTIME_UNAVAILABLE   = MEDIA_ERROR_BASE + 130,
+
+	// Mail Kit Errors
+	MAIL_NO_DAEMON               = MAIL_ERROR_BASE + 0,
+	MAIL_UNKNOWN_USER            = MAIL_ERROR_BASE + 1,
+	MAIL_WRONG_PASSWORD          = MAIL_ERROR_BASE + 2,
+	MAIL_UNKNOWN_HOST            = MAIL_ERROR_BASE + 3,
+	MAIL_ACCESS_ERROR            = MAIL_ERROR_BASE + 4,
+	MAIL_UNKNOWN_FIELD           = MAIL_ERROR_BASE + 5,
+	MAIL_NO_RECIPIENT            = MAIL_ERROR_BASE + 6,
+	MAIL_INVALID_MAIL            = MAIL_ERROR_BASE + 7,
+
+	// Printing Errors
+	NO_PRINT_SERVER              = PRINT_ERROR_BASE + 0,
+
+	// Device Kit Errors
+	DEV_INVALID_IOCTL            = DEVICE_ERROR_BASE + 0,
+	DEV_NO_MEMORY                = DEVICE_ERROR_BASE + 1,
+	DEV_BAD_DRIVE_NUM            = DEVICE_ERROR_BASE + 2,
+	DEV_NO_MEDIA                 = DEVICE_ERROR_BASE + 3,
+	DEV_UNREADABLE               = DEVICE_ERROR_BASE + 4,
+	DEV_FORMAT_ERROR             = DEVICE_ERROR_BASE + 5,
+	DEV_TIMEOUT                  = DEVICE_ERROR_BASE + 6,
+	DEV_RECALIBRATE_ERROR        = DEVICE_ERROR_BASE + 7,
+	DEV_SEEK_ERROR               = DEVICE_ERROR_BASE + 8,
+	DEV_ID_ERROR                 = DEVICE_ERROR_BASE + 9,
+	DEV_READ_ERROR               = DEVICE_ERROR_BASE + 10,
+	DEV_WRITE_ERROR              = DEVICE_ERROR_BASE + 11,
+	DEV_NOT_READY                = DEVICE_ERROR_BASE + 12,
+	DEV_MEDIA_CHANGED            = DEVICE_ERROR_BASE + 13,
+	DEV_MEDIA_CHANGE_REQUESTED   = DEVICE_ERROR_BASE + 14,
+	DEV_RESOURCE_CONFLICT        = DEVICE_ERROR_BASE + 15,
+	DEV_CONFIGURATION_ERROR      = DEVICE_ERROR_BASE + 16,
+	DEV_DISABLED_BY_USER         = DEVICE_ERROR_BASE + 17,
+	DEV_DOOR_OPEN                = DEVICE_ERROR_BASE + 18,
+	DEV_INVALID_PIPE             = DEVICE_ERROR_BASE + 19,
+	DEV_CRC_ERROR                = DEVICE_ERROR_BASE + 20,
+	DEV_STALLED                  = DEVICE_ERROR_BASE + 21,
+	DEV_BAD_PID                  = DEVICE_ERROR_BASE + 22,
+	DEV_UNEXPECTED_PID           = DEVICE_ERROR_BASE + 23,
+	DEV_DATA_OVERRUN             = DEVICE_ERROR_BASE + 24,
+	DEV_DATA_UNDERRUN            = DEVICE_ERROR_BASE + 25,
+	DEV_FIFO_OVERRUN             = DEVICE_ERROR_BASE + 26,
+	DEV_FIFO_UNDERRUN            = DEVICE_ERROR_BASE + 27,
+	DEV_PENDING                  = DEVICE_ERROR_BASE + 28,
+	DEV_MULTIPLE_ERRORS          = DEVICE_ERROR_BASE + 29,
+	DEV_TOO_LATE                 = DEVICE_ERROR_BASE + 30,
+
+	// Translation Kit Errors
+	TRANSLATION_BASE_ERROR       = TRANSLATION_ERROR_BASE + 0,
+	NO_TRANSLATOR                = TRANSLATION_ERROR_BASE + 1,
+	ILLEGAL_DATA                 = TRANSLATION_ERROR_BASE + 2,
+}
+
+// Returns the current errno value as an Errno, read through the pointer
+// supplied by `_errnop`.
+errno :: #force_inline proc "contextless" () -> Errno {
+	raw := _errnop()^
+	return Errno(raw)
+}
+
+// Error-related helpers imported from the system C library.
+foreign import libroot "system:c"
+foreign libroot {
+	// NOTE(review): presumably convert an error code between Haiku's negative
+	// form and the positive POSIX form - confirm against the Haiku headers.
+	_to_positive_error :: proc(error: c.int) -> c.int ---
+	_to_negative_error :: proc(error: c.int) -> c.int ---
+
+	// Returns the address of errno; dereferenced by errno() above.
+	_errnop :: proc() -> ^c.int ---
+}
diff --git a/core/sys/haiku/find_directory.odin b/core/sys/haiku/find_directory.odin
new file mode 100644
index 000000000..103e677d7
--- /dev/null
+++ b/core/sys/haiku/find_directory.odin
@@ -0,0 +1,168 @@
+//+build haiku
+package sys_haiku
+
+import "core:c"
+
+directory_which :: enum c.int {
+	// Per volume directories
+	DESKTOP_DIRECTORY = 0,
+	TRASH_DIRECTORY,
+
+	// System directories
+	SYSTEM_DIRECTORY        = 1000,
+	SYSTEM_ADDONS_DIRECTORY = 1002,
+	SYSTEM_BOOT_DIRECTORY,
+	SYSTEM_FONTS_DIRECTORY,
+	SYSTEM_LIB_DIRECTORY,
+	SYSTEM_SERVERS_DIRECTORY,
+	SYSTEM_APPS_DIRECTORY,
+	SYSTEM_BIN_DIRECTORY,
+	SYSTEM_DOCUMENTATION_DIRECTORY = 1010,
+	SYSTEM_PREFERENCES_DIRECTORY,
+	SYSTEM_TRANSLATORS_DIRECTORY,
+	SYSTEM_MEDIA_NODES_DIRECTORY,
+	SYSTEM_SOUNDS_DIRECTORY,
+	SYSTEM_DATA_DIRECTORY,
+	SYSTEM_DEVELOP_DIRECTORY,
+	SYSTEM_PACKAGES_DIRECTORY,
+	SYSTEM_HEADERS_DIRECTORY,
+	SYSTEM_ETC_DIRECTORY      = 2008,
+	SYSTEM_SETTINGS_DIRECTORY = 2010,
+	SYSTEM_LOG_DIRECTORY      = 2012,
+	SYSTEM_SPOOL_DIRECTORY,
+	SYSTEM_TEMP_DIRECTORY,
+	SYSTEM_VAR_DIRECTORY,
+	SYSTEM_CACHE_DIRECTORY       = 2020,
+	SYSTEM_NONPACKAGED_DIRECTORY = 2023,
+	SYSTEM_NONPACKAGED_ADDONS_DIRECTORY,
+	SYSTEM_NONPACKAGED_TRANSLATORS_DIRECTORY,
+	SYSTEM_NONPACKAGED_MEDIA_NODES_DIRECTORY,
+	SYSTEM_NONPACKAGED_BIN_DIRECTORY,
+	SYSTEM_NONPACKAGED_DATA_DIRECTORY,
+	SYSTEM_NONPACKAGED_FONTS_DIRECTORY,
+	SYSTEM_NONPACKAGED_SOUNDS_DIRECTORY,
+	SYSTEM_NONPACKAGED_DOCUMENTATION_DIRECTORY,
+	SYSTEM_NONPACKAGED_LIB_DIRECTORY,
+	SYSTEM_NONPACKAGED_HEADERS_DIRECTORY,
+	SYSTEM_NONPACKAGED_DEVELOP_DIRECTORY,
+
+	// User directories. These are interpreted in the context of the user making the find_directory call.
+	USER_DIRECTORY = 3000,
+	USER_CONFIG_DIRECTORY,
+	USER_ADDONS_DIRECTORY,
+	USER_BOOT_DIRECTORY,
+	USER_FONTS_DIRECTORY,
+	USER_LIB_DIRECTORY,
+	USER_SETTINGS_DIRECTORY,
+	USER_DESKBAR_DIRECTORY,
+	USER_PRINTERS_DIRECTORY,
+	USER_TRANSLATORS_DIRECTORY,
+	USER_MEDIA_NODES_DIRECTORY,
+	USER_SOUNDS_DIRECTORY,
+	USER_DATA_DIRECTORY,
+	USER_CACHE_DIRECTORY,
+	USER_PACKAGES_DIRECTORY,
+	USER_HEADERS_DIRECTORY,
+	USER_NONPACKAGED_DIRECTORY,
+	USER_NONPACKAGED_ADDONS_DIRECTORY,
+	USER_NONPACKAGED_TRANSLATORS_DIRECTORY,
+	USER_NONPACKAGED_MEDIA_NODES_DIRECTORY,
+	USER_NONPACKAGED_BIN_DIRECTORY,
+	USER_NONPACKAGED_DATA_DIRECTORY,
+	USER_NONPACKAGED_FONTS_DIRECTORY,
+	USER_NONPACKAGED_SOUNDS_DIRECTORY,
+	USER_NONPACKAGED_DOCUMENTATION_DIRECTORY,
+	USER_NONPACKAGED_LIB_DIRECTORY,
+	USER_NONPACKAGED_HEADERS_DIRECTORY,
+	USER_NONPACKAGED_DEVELOP_DIRECTORY,
+	USER_DEVELOP_DIRECTORY,
+	USER_DOCUMENTATION_DIRECTORY,
+	USER_SERVERS_DIRECTORY,
+	USER_APPS_DIRECTORY,
+	USER_BIN_DIRECTORY,
+	USER_PREFERENCES_DIRECTORY,
+	USER_ETC_DIRECTORY,
+	USER_LOG_DIRECTORY,
+	USER_SPOOL_DIRECTORY,
+	USER_VAR_DIRECTORY,
+
+	// Global directories
+	APPS_DIRECTORY = 4000,
+	PREFERENCES_DIRECTORY,
+	UTILITIES_DIRECTORY,
+	PACKAGE_LINKS_DIRECTORY,
+
+	// Obsolete: Legacy BeOS definition to be phased out
+	BEOS_DIRECTORY = 1000,
+	BEOS_SYSTEM_DIRECTORY,
+	BEOS_ADDONS_DIRECTORY,
+	BEOS_BOOT_DIRECTORY,
+	BEOS_FONTS_DIRECTORY,
+	BEOS_LIB_DIRECTORY,
+	BEOS_SERVERS_DIRECTORY,
+	BEOS_APPS_DIRECTORY,
+	BEOS_BIN_DIRECTORY,
+	BEOS_ETC_DIRECTORY,
+	BEOS_DOCUMENTATION_DIRECTORY,
+	BEOS_PREFERENCES_DIRECTORY,
+	BEOS_TRANSLATORS_DIRECTORY,
+	BEOS_MEDIA_NODES_DIRECTORY,
+	BEOS_SOUNDS_DIRECTORY,
+}
+
+find_path_flags :: enum c.int {
+	CREATE_DIRECTORY        = 0x0001,
+	CREATE_PARENT_DIRECTORY = 0x0002,
+	EXISTING_ONLY           = 0x0004,
+	
+	// find_paths() only!
+	SYSTEM_ONLY             = 0x0010,
+	USER_ONLY               = 0x0020,
+}
+
+path_base_directory :: enum c.int {
+	INSTALLATION_LOCATION_DIRECTORY,
+	ADD_ONS_DIRECTORY,
+	APPS_DIRECTORY,
+	BIN_DIRECTORY,
+	BOOT_DIRECTORY,
+	CACHE_DIRECTORY,
+	DATA_DIRECTORY,
+	DEVELOP_DIRECTORY,
+	DEVELOP_LIB_DIRECTORY,
+	DOCUMENTATION_DIRECTORY,
+	ETC_DIRECTORY,
+	FONTS_DIRECTORY,
+	HEADERS_DIRECTORY,
+	LIB_DIRECTORY,
+	LOG_DIRECTORY,
+	MEDIA_NODES_DIRECTORY,
+	PACKAGES_DIRECTORY,
+	PREFERENCES_DIRECTORY,
+	SERVERS_DIRECTORY,
+	SETTINGS_DIRECTORY,
+	SOUNDS_DIRECTORY,
+	SPOOL_DIRECTORY,
+	TRANSLATORS_DIRECTORY,
+	VAR_DIRECTORY,
+
+	// find_path() only!
+	IMAGE_PATH = 1000,
+	PACKAGE_PATH,
+}
+
+// Value that can be passed instead of a pointer to a symbol in the program image (a null address).
+APP_IMAGE_SYMBOL :: rawptr(addr_t(0))
+// Pointer to a symbol in the caller's image (same as B_CURRENT_IMAGE_SYMBOL); implemented by returning this procedure's own address.
+current_image_symbol :: proc() -> rawptr { return rawptr(current_image_symbol) }
+
+foreign import libroot "system:c"
+foreign libroot { // path lookup routines; each one reports its outcome through the returned status_t
+	find_directory         :: proc(which: directory_which, volume: dev_t, createIt: bool, pathString: [^]c.char, length: i32) -> status_t --- // result goes to pathString; `length` is presumably its capacity — TODO confirm
+	find_path              :: proc(codePointer: rawptr, baseDirectory: path_base_directory, subPath: cstring, pathBuffer: [^]c.char, bufferSize: c.size_t) -> status_t --- // codePointer may be APP_IMAGE_SYMBOL or current_image_symbol()
+	find_path_etc          :: proc(codePointer: rawptr, dependency: cstring, architecture: cstring, baseDirectory: path_base_directory, subPath: cstring, flags: find_path_flags, pathBuffer: [^]c.char, bufferSize: c.size_t) -> status_t ---
+	find_path_for_path     :: proc(path: cstring, baseDirectory: path_base_directory, subPath: cstring, pathBuffer: [^]c.char, bufferSize: c.size_t) -> status_t ---
+	find_path_for_path_etc :: proc(path: cstring, dependency: cstring, architecture: cstring, baseDirectory: path_base_directory, subPath: cstring, flags: find_path_flags, pathBuffer: [^]c.char, bufferSize: c.size_t) -> status_t ---
+	find_paths             :: proc(baseDirectory: path_base_directory, subPath: cstring, _paths: ^[^][^]c.char, _pathCount: ^c.size_t) -> status_t --- // _paths/_pathCount are out-parameters; NOTE(review): who frees the returned array is not visible here
+	find_paths_etc         :: proc(architecture: cstring, baseDirectory: path_base_directory, subPath: cstring, flags: find_path_flags, _paths: ^[^][^]c.char, _pathCount: ^c.size_t) -> status_t ---
+}
diff --git a/core/sys/haiku/os.odin b/core/sys/haiku/os.odin
new file mode 100644
index 000000000..1e00145eb
--- /dev/null
+++ b/core/sys/haiku/os.odin
@@ -0,0 +1,502 @@
+//+build haiku
+package sys_haiku
+
+import "core:c"
+import "core:sys/unix"
+
+foreign import libroot "system:c"
+
+PATH_MAX   :: 1024
+NAME_MAX   :: 256
+MAXPATHLEN :: PATH_MAX
+
+FILE_NAME_LENGTH :: NAME_MAX
+PATH_NAME_LENGTH :: MAXPATHLEN
+OS_NAME_LENGTH   :: 32
+
+// Areas
+
+area_info :: struct {
+	area:       area_id,
+	name:       [OS_NAME_LENGTH]c.char,
+	size:       c.size_t,
+	lock:       u32,
+	protection: u32,
+	team:       team_id,
+	ram_size:   u32,
+	copy_count: u32,
+	in_count:   u32,
+	out_count:  u32,
+	address:    rawptr,
+}
+
+area_locking :: enum u32 {
+	NO_LOCK           = 0,
+	LAZY_LOCK         = 1,
+	FULL_LOCK         = 2,
+	CONTIGUOUS        = 3,
+	LOMEM             = 4, // CONTIGUOUS, < 16 MB physical address
+	_32_BIT_FULL_LOCK  = 5, // FULL_LOCK, < 4 GB physical addresses
+	_32_BIT_CONTIGUOUS = 6, // CONTIGUOUS, < 4 GB physical address
+}
+
+// for create_area() and clone_area()
+address_spec :: enum u32 {
+	ANY_ADDRESS             = 0,
+	EXACT_ADDRESS           = 1,
+	BASE_ADDRESS            = 2,
+	CLONE_ADDRESS           = 3,
+	ANY_KERNEL_ADDRESS      = 4,
+	// ANY_KERNEL_BLOCK_ADDRESS = 5,
+	RANDOMIZED_ANY_ADDRESS  = 6,
+	RANDOMIZED_BASE_ADDRESS = 7,
+}
+
+area_protection_flags :: enum u32 {
+	READ_AREA      = 1 << 0,
+	WRITE_AREA     = 1 << 1,
+	EXECUTE_AREA   = 1 << 2,
+	// "stack" protection is not available on most platforms - it's used
+	// to only commit memory as needed, and have guard pages at the
+	// bottom of the stack.
+	STACK_AREA     = 1 << 3,
+	CLONEABLE_AREA = 1 << 8,
+}
+
+foreign libroot { // area management calls; creation/lookup return an area_id, the rest a status_t
+	create_area         :: proc(name: cstring, startAddress: ^rawptr, addressSpec: address_spec, size: c.size_t, lock: area_locking, protection: area_protection_flags) -> area_id --- // startAddress is passed by pointer (presumably in/out — TODO confirm)
+	clone_area          :: proc(name: cstring, destAddress: ^rawptr, addressSpec: address_spec, protection: area_protection_flags, source: area_id) -> area_id ---
+	find_area           :: proc(name: cstring) -> area_id ---
+	area_for            :: proc(address: rawptr) -> area_id ---
+	delete_area         :: proc(id: area_id) -> status_t ---
+	resize_area         :: proc(id: area_id, newSize: c.size_t) -> status_t ---
+	set_area_protection :: proc(id: area_id, newProtection: area_protection_flags) -> status_t ---
+	_get_area_info      :: proc(id: area_id, areaInfo: ^area_info, size: c.size_t) -> status_t --- // `size` is presumably size_of(area_info) — TODO confirm
+	_get_next_area_info :: proc(team: team_id, cookie: ^c.ssize_t, areaInfo: ^area_info, size: c.size_t) -> status_t --- // `cookie` carries iteration state between calls (initial value not visible here)
+}
+
+// Ports
+
+port_info :: struct {
+	port:        port_id,
+	team:        team_id,
+	name:        [OS_NAME_LENGTH]c.char,
+	capacity:    i32, // queue depth
+	queue_count: i32, // # msgs waiting to be read
+	total_count: i32, // total # msgs read so far
+}
+
+port_flags :: enum u32 {
+	USE_USER_MEMCPY   = 0x80000000,
+	// read the message, but don't remove it; kernel-only; memory must be locked
+	PEEK_PORT_MESSAGE = 0x100,
+}
+
+foreign libroot { // port (message queue) calls; see port_info for the queue counters
+	create_port          :: proc(capacity: i32, name: cstring) -> port_id ---
+	find_port            :: proc(name: cstring) -> port_id ---
+	read_port            :: proc(port: port_id, code: ^i32, buffer: rawptr, bufferSize: c.size_t) -> c.ssize_t --- // message code is returned through `code`
+	read_port_etc        :: proc(port: port_id, code: ^i32, buffer: rawptr, bufferSize: c.size_t, flags: port_flags, timeout: bigtime_t) -> c.ssize_t ---
+	write_port           :: proc(port: port_id, code: i32, buffer: rawptr, bufferSize: c.size_t) -> status_t ---
+	write_port_etc       :: proc(port: port_id, code: i32, buffer: rawptr, bufferSize: c.size_t, flags: port_flags, timeout: bigtime_t) -> status_t ---
+	close_port           :: proc(port: port_id) -> status_t ---
+	delete_port          :: proc(port: port_id) -> status_t ---
+	port_buffer_size     :: proc(port: port_id) -> c.ssize_t ---
+	port_buffer_size_etc :: proc(port: port_id, flags: port_flags, timeout: bigtime_t) -> c.ssize_t ---
+	port_count           :: proc(port: port_id) -> c.ssize_t ---
+	set_port_owner       :: proc(port: port_id, team: team_id) -> status_t ---
+	_get_port_info       :: proc(port: port_id, portInfo: ^port_info, portInfoSize: c.size_t) -> status_t --- // portInfoSize is presumably size_of(port_info) — TODO confirm
+	_get_next_port_info  :: proc(team: team_id, cookie: ^i32, portInfo: ^port_info, portInfoSize: c.size_t) -> status_t ---
+}
+
+// Semaphores
+
+sem_info :: struct {
+	sem:           sem_id,
+	team:          team_id,
+	name:          [OS_NAME_LENGTH]c.char,
+	count:         i32,
+	latest_holder: thread_id,
+}
+
+semaphore_flags :: enum u32 {
+	CAN_INTERRUPT      = 0x01, // acquisition of the semaphore can be interrupted (system use only)
+	CHECK_PERMISSION   = 0x04, // ownership will be checked (system use only)
+	KILL_CAN_INTERRUPT = 0x20, // acquisition of the semaphore can be interrupted by SIGKILL[THR], even if not CAN_INTERRUPT (system use only)
+	
+	// release_sem_etc() only flags
+	DO_NOT_RESCHEDULE       = 0x02, // thread is not rescheduled
+	RELEASE_ALL             = 0x08, // all waiting threads will be woken up, count will be zeroed
+	RELEASE_IF_WAITING_ONLY	= 0x10, // release count only if there are any threads waiting
+}
+
+foreign libroot { // kernel semaphore calls; see sem_info and semaphore_flags for the related data
+	create_sem         :: proc(count: i32, name: cstring) -> sem_id ---
+	delete_sem         :: proc(id: sem_id) -> status_t ---
+	acquire_sem        :: proc(id: sem_id) -> status_t ---
+	acquire_sem_etc    :: proc(id: sem_id, count: i32, flags: semaphore_flags, timeout: bigtime_t) -> status_t ---
+	release_sem        :: proc(id: sem_id) -> status_t ---
+	release_sem_etc    :: proc(id: sem_id, count: i32, flags: semaphore_flags) -> status_t ---
+	switch_sem         :: proc(semToBeReleased: sem_id, id: sem_id) -> status_t --- // takes both semaphores like switch_sem_etc: the one to release and the one (`id`) to acquire
+	switch_sem_etc     :: proc(semToBeReleased: sem_id, id: sem_id, count: i32, flags: semaphore_flags, timeout: bigtime_t) -> status_t ---
+	get_sem_count      :: proc(id: sem_id, threadCount: ^i32) -> status_t --- // the count is returned through `threadCount`
+	set_sem_owner      :: proc(id: sem_id, team: team_id) -> status_t ---
+	_get_sem_info      :: proc(id: sem_id, info: ^sem_info, infoSize: c.size_t) -> status_t --- // infoSize is presumably size_of(sem_info) — TODO confirm
+	_get_next_sem_info :: proc(team: team_id, cookie: ^i32, info: ^sem_info, infoSize: c.size_t) -> status_t ---
+}
+
+// Teams
+
+team_info :: struct {
+	team:                team_id,
+	thread_count:        i32,
+	image_count:         i32,
+	area_count:          i32,
+	debugger_nub_thread: thread_id,
+	debugger_nub_port:   port_id,
+	argc:                i32,
+	args:                [64]c.char,
+	uid:                 uid_t,
+	gid:                 gid_t,
+
+	// Haiku R1 extensions
+	real_uid:            uid_t,
+	real_gid:            gid_t,
+	group_id:            pid_t,
+	session_id:          pid_t,
+	parent:              team_id,
+	name:                [OS_NAME_LENGTH]c.char,
+	start_time:          bigtime_t,
+}
+
+CURRENT_TEAM :: 0
+SYSTEM_TEAM  :: 1
+
+team_usage_info :: struct {
+	user_time:   bigtime_t,
+	kernel_time: bigtime_t,
+}
+
+team_usage_who :: enum i32 {
+	// compatible to sys/resource.h RUSAGE_SELF and RUSAGE_CHILDREN
+	SELF     = 0,
+	CHILDREN = -1,
+}
+
+foreign libroot {
+	// see also: send_signal()
+	kill_team            :: proc(team: team_id) -> status_t ---
+	_get_team_info       :: proc(id: team_id, info: ^team_info, size: c.size_t) -> status_t ---
+	_get_next_team_info  :: proc(cookie: ^i32, info: ^team_info, size: c.size_t) -> status_t ---
+	_get_team_usage_info :: proc(id: team_id, who: team_usage_who, info: ^team_usage_info, size: c.size_t) -> status_t ---
+}
+
+// Threads
+
+thread_state :: enum c.int {
+	RUNNING = 1,
+	READY,
+	RECEIVING,
+	ASLEEP,
+	SUSPENDED,
+	WAITING,
+}
+
+thread_info :: struct {
+	thread:      thread_id,
+	team:        team_id,
+	name:        [OS_NAME_LENGTH]c.char,
+	state:       thread_state,
+	priority:    thread_priority,
+	sem:         sem_id,
+	user_time:   bigtime_t,
+	kernel_time: bigtime_t,
+	stack_base:  rawptr,
+	stack_end:   rawptr,
+}
+
+thread_priority :: enum i32 { // priority levels taken by spawn_thread and set_thread_priority
+	IDLE_PRIORITY              = 0,
+	LOWEST_ACTIVE_PRIORITY     = 1,
+	LOW_PRIORITY               = 5,
+	NORMAL_PRIORITY            = 10,
+	DISPLAY_PRIORITY           = 15,
+	URGENT_DISPLAY_PRIORITY    = 20,
+	REAL_TIME_DISPLAY_PRIORITY = 100,
+	URGENT_PRIORITY            = 110,
+	REAL_TIME_PRIORITY         = 120,
+}
+// Real-time priorities begin at REAL_TIME_DISPLAY_PRIORITY (100), matching B_FIRST_REAL_TIME_PRIORITY in Haiku's OS.h.
+FIRST_REAL_TIME_PRIORITY :: thread_priority.REAL_TIME_DISPLAY_PRIORITY
+
+// time base for snooze_*(), compatible with the clockid_t constants defined in <time.h> 
+SYSTEM_TIMEBASE :: 0
+
+thread_func :: #type proc "c" (rawptr) -> status_t
+
+foreign libroot { // native thread calls; thread entry points use the `thread_func` signature
+	spawn_thread          :: proc(func: thread_func, name: cstring, priority: thread_priority, data: rawptr) -> thread_id --- // `func` must be named explicitly: `thread_func, name: cstring` would declare two cstring parameters
+	kill_thread           :: proc(thread: thread_id) -> status_t ---
+	resume_thread         :: proc(thread: thread_id) -> status_t ---
+	suspend_thread        :: proc(thread: thread_id) -> status_t ---
+	rename_thread         :: proc(thread: thread_id, newName: cstring) -> status_t ---
+	set_thread_priority   :: proc(thread: thread_id, newPriority: thread_priority) -> status_t ---
+	exit_thread           :: proc(status: status_t) ---
+	wait_for_thread       :: proc(thread: thread_id, returnValue: ^status_t) -> status_t --- // the thread's result is returned through `returnValue`
+	// FIXME: Find and define those flags.
+	wait_for_thread_etc   :: proc(id: thread_id, flags: u32, timeout: bigtime_t, _returnCode: ^status_t) -> status_t ---
+	on_exit_thread        :: proc(callback: proc "c" (rawptr), data: rawptr) -> status_t ---
+	find_thread           :: proc(name: cstring) -> thread_id ---
+	send_data             :: proc(thread: thread_id, code: i32, buffer: rawptr, bufferSize: c.size_t) -> status_t ---
+	receive_data          :: proc(sender: ^thread_id, buffer: rawptr, bufferSize: c.size_t) -> i32 ---
+	has_data              :: proc(thread: thread_id) -> bool ---
+	snooze                :: proc(amount: bigtime_t) -> status_t ---
+	// FIXME: Find and define those flags.
+	snooze_etc            :: proc(amount: bigtime_t, timeBase: c.int, flags: u32) -> status_t --- // timeBase: e.g. SYSTEM_TIMEBASE
+	snooze_until          :: proc(time: bigtime_t, timeBase: c.int) -> status_t ---
+	_get_thread_info      :: proc(id: thread_id, info: ^thread_info, size: c.size_t) -> status_t --- // `size` is presumably size_of(thread_info) — TODO confirm
+	_get_next_thread_info :: proc(team: team_id, cookie: ^i32, info: ^thread_info, size: c.size_t) -> status_t ---
+	// bridge to the pthread API
+	get_pthread_thread_id :: proc(thread: pthread_t) -> thread_id ---
+}
+
+// Time
+
+foreign libroot { // wall-clock and uptime queries
+	real_time_clock       :: proc() -> c.ulong --- // presumably seconds since 1 Jan 1970, judging by the setter's parameter name — TODO confirm
+	set_real_time_clock   :: proc(secsSinceJan1st1970: c.ulong) ---
+	real_time_clock_usecs :: proc() -> bigtime_t --- // same clock as a bigtime_t (presumably microseconds — TODO confirm)
+	// time since booting in microseconds
+	system_time           :: proc() -> bigtime_t ---
+	// time since booting in nanoseconds
+	system_time_nsecs     :: proc() -> nanotime_t ---
+}
+
+// Alarm
+
+alarm_mode :: enum u32 {
+	ONE_SHOT_ABSOLUTE_ALARM	= 1,
+	ONE_SHOT_RELATIVE_ALARM,
+	PERIODIC_ALARM, // "when" specifies the period
+}
+
+foreign libroot {
+	set_alarm :: proc(_when: bigtime_t, mode: alarm_mode) -> bigtime_t ---
+}
+
+// Debugger
+
+foreign libroot {
+	debugger :: proc(message: cstring) ---
+	/*
+		calling this function with a non-zero value will cause your thread
+		to receive signals for any exceptional conditions that occur (i.e.
+		you'll get SIGSEGV for data access exceptions, SIGFPE for floating
+		point errors, SIGILL for illegal instructions, etc).
+
+		to re-enable the default debugger pass a zero.
+	*/
+	disable_debugger :: proc(state: c.int) -> c.int ---
+}
+
+// System information
+
+cpu_info :: struct {
+	active_time:       bigtime_t,
+	enabled:           bool,
+	current_frequency: u64,
+}
+
+system_info :: struct {
+	boot_time:         bigtime_t, // time of boot (usecs since 1/1/1970)
+
+	cpu_count:         u32,       // number of cpus
+
+	max_pages:         u64,       // total # of accessible pages
+	used_pages:        u64,       // # of accessible pages in use
+	cached_pages:      u64,
+	block_cache_pages: u64,
+	ignored_pages:     u64,	      // # of ignored/inaccessible pages
+
+	needed_memory:     u64,
+	free_memory:       u64,
+
+	max_swap_pages:    u64,
+	free_swap_pages:   u64,
+
+	page_faults:       u32,	      // # of page faults
+
+	max_sems:          u32,
+	used_sems:         u32,
+
+	max_ports:         u32,
+	used_ports:        u32,
+
+	max_threads:       u32,
+	used_threads:      u32,
+
+	max_teams:         u32,
+	used_teams:        u32,
+
+	kernel_name:       [FILE_NAME_LENGTH]c.char,
+	kernel_build_date: [OS_NAME_LENGTH]c.char,
+	kernel_build_time: [OS_NAME_LENGTH]c.char,
+
+	kernel_version:    i64,
+	abi:               u32,       // the system API
+}
+
+topology_level_type :: enum c.int {
+	UNKNOWN,
+	ROOT,
+	SMT,
+	CORE,
+	PACKAGE,
+}
+
+cpu_platform :: enum c.int {
+	UNKNOWN,
+	x86,
+	x86_64,
+	PPC,
+	PPC_64,
+	M68K,
+	ARM,
+	ARM_64,
+	ALPHA,
+	MIPS,
+	SH,
+	SPARC,
+	RISC_V,
+}
+
+cpu_vendor :: enum c.int {
+	UNKNOWN,
+	AMD,
+	CYRIX,
+	IDT,
+	INTEL,
+	NATIONAL_SEMICONDUCTOR,
+	RISE,
+	TRANSMETA,
+	VIA,
+	IBM,
+	MOTOROLA,
+	NEC,
+	HYGON,
+	SUN,
+	FUJITSU,
+}
+
+cpu_topology_node_info :: struct { // element type of the array passed to get_cpu_topology_info
+	id:            u32,
+	type:          topology_level_type,
+	level:         u32,
+
+	data: struct #raw_union { // overlapping payloads; which member is meaningful presumably depends on `type` — TODO confirm
+		_root: struct {
+			platform: cpu_platform,
+		},
+		_package: struct {
+			vendor:          cpu_vendor,
+			cache_line_size: u32,
+		},
+		_core: struct {
+			model:             u32,
+			default_frequency: u64,
+		},
+	},
+}
+
+// FIXME: Add cpuid_info when bit fields are ready.
+
+foreign libroot { // system and CPU information queries
+	get_system_info       :: proc(info: ^system_info) -> status_t --- // fills the caller-provided system_info
+	_get_cpu_info_etc     :: proc(firstCPU: u32, cpuCount: u32, info: ^cpu_info, size: c.size_t) -> status_t --- // NOTE(review): `info` presumably points at cpuCount entries and `size` is size_of(cpu_info) — confirm
+	get_cpu_topology_info :: proc(topologyInfos: [^]cpu_topology_node_info, topologyInfoCount: ^u32) -> status_t --- // topologyInfoCount is passed by pointer (presumably in/out — TODO confirm)
+
+	is_computer_on        :: proc() -> i32 ---
+	is_computer_on_fire   :: proc() -> f64 ---
+}
+
+// Signal.h
+
+SIG_BLOCK   :: 1
+SIG_UNBLOCK :: 2
+SIG_SETMASK :: 3
+
+/*
+ * The list of all defined signals:
+ *
+ * The numbering of signals for Haiku attempts to maintain
+ * some consistency with UN*X conventions so that things
+ * like "kill -9" do what you expect.
+ */
+
+SIGHUP     :: 1  // hangup -- tty is gone!
+SIGINT     :: 2  // interrupt
+SIGQUIT    :: 3  // `quit' special character typed in tty
+SIGILL     :: 4  // illegal instruction
+SIGCHLD    :: 5  // child process exited
+SIGABRT    :: 6  // abort() called, don't catch
+SIGPIPE    :: 7  // write to a pipe w/no readers
+SIGFPE     :: 8  // floating point exception
+SIGKILL    :: 9  // kill a team (not catchable)
+SIGSTOP    :: 10 // suspend a thread (not catchable)
+SIGSEGV    :: 11 // segmentation violation (read: invalid pointer)
+SIGCONT    :: 12 // continue execution if suspended
+SIGTSTP    :: 13 // `stop' special character typed in tty
+SIGALRM    :: 14 // an alarm has gone off (see alarm())
+SIGTERM    :: 15 // termination requested
+SIGTTIN    :: 16 // read of tty from bg process
+SIGTTOU    :: 17 // write to tty from bg process
+SIGUSR1    :: 18 // app defined signal 1
+SIGUSR2    :: 19 // app defined signal 2
+SIGWINCH   :: 20 // tty window size changed
+SIGKILLTHR :: 21 // be specific: kill just the thread, not team
+SIGTRAP    :: 22 // Trace/breakpoint trap
+SIGPOLL    :: 23 // Pollable event
+SIGPROF    :: 24 // Profiling timer expired
+SIGSYS     :: 25 // Bad system call
+SIGURG     :: 26 // High bandwidth data is available at socket
+SIGVTALRM  :: 27 // Virtual timer expired
+SIGXCPU    :: 28 // CPU time limit exceeded
+SIGXFSZ    :: 29 // File size limit exceeded
+SIGBUS     :: 30 // access to undefined portion of a memory object
+
+sigval :: struct #raw_union {
+	sival_int: c.int,
+	sival_ptr: rawptr,
+}
+
+siginfo_t :: struct {
+	si_signo:  c.int,  // signal number
+	si_code:   c.int,  // signal code
+	si_errno:  c.int,  // if non zero, an error number associated with this signal
+
+	si_pid:    pid_t,  // sending process ID
+	si_uid:    uid_t,  // real user ID of sending process
+	si_addr:   rawptr, // address of faulting instruction
+	si_status: c.int,  // exit value or signal
+	si_band:   c.long, // band event for SIGPOLL
+	si_value:  sigval, // signal value
+}
+
+foreign libroot {
+	// signal set (sigset_t) manipulation
+	sigemptyset  :: proc(set: ^sigset_t) -> c.int ---
+	sigfillset   :: proc(set: ^sigset_t) -> c.int ---
+	sigaddset    :: proc(set: ^sigset_t, _signal: c.int) -> c.int ---
+	sigdelset    :: proc(set: ^sigset_t, _signal: c.int) -> c.int ---
+	sigismember  :: proc(set: ^sigset_t, _signal: c.int) -> c.int ---
+	// querying and waiting for signals
+	sigpending   :: proc(set: ^sigset_t) -> c.int ---
+	sigsuspend   :: proc(mask: ^sigset_t) -> c.int ---
+	sigpause     :: proc(_signal: c.int) -> c.int ---
+	sigwait      :: proc(set: ^sigset_t, _signal: ^c.int) -> c.int ---
+	sigwaitinfo  :: proc(set: ^sigset_t, info: ^siginfo_t) -> c.int ---
+	sigtimedwait :: proc(set: ^sigset_t, info: ^siginfo_t, timeout: ^unix.timespec) -> c.int ---
+
+	send_signal      :: proc(threadID: thread_id, signal: c.uint) -> c.int ---
+	set_signal_stack :: proc(base: rawptr, size: c.size_t) ---
+}
diff --git a/core/sys/haiku/types.odin b/core/sys/haiku/types.odin
new file mode 100644
index 000000000..0440d5a98
--- /dev/null
+++ b/core/sys/haiku/types.odin
@@ -0,0 +1,54 @@
+//+build haiku
+package sys_haiku
+
+import "core:c"
+
+status_t       :: i32
+bigtime_t      :: i64
+nanotime_t     :: i64
+type_code      :: u32
+perform_code   :: u32
+
+phys_addr_t    :: uintptr
+phys_size_t    :: phys_addr_t
+generic_addr_t :: uintptr
+generic_size_t :: generic_addr_t
+
+area_id        :: i32
+port_id        :: i32
+sem_id         :: i32
+team_id        :: i32
+thread_id      :: i32
+
+blkcnt_t       :: i64
+blksize_t      :: i32
+fsblkcnt_t     :: i64
+fsfilcnt_t     :: i64
+off_t          :: i64
+ino_t          :: i64
+cnt_t          :: i32
+dev_t          :: i32
+pid_t          :: i32
+id_t           :: i32
+
+uid_t          :: u32
+gid_t          :: u32
+mode_t         :: u32
+umode_t        :: u32
+nlink_t        :: i32
+
+caddr_t        :: ^c.char
+
+addr_t         :: phys_addr_t
+key_t          :: i32
+
+clockid_t      :: i32
+
+time_t         :: i64 when ODIN_ARCH == .amd64 || ODIN_ARCH == .arm64 else i32
+
+sig_atomic_t   :: c.int
+sigset_t       :: u64
+
+image_id       :: i32
+
+pthread_t      :: rawptr
diff --git a/core/sys/unix/pthread_haiku.odin b/core/sys/unix/pthread_haiku.odin
new file mode 100644
index 000000000..1278f34fe
--- /dev/null
+++ b/core/sys/unix/pthread_haiku.odin
@@ -0,0 +1,71 @@
+package unix
+
+import "core:c"
+
+pthread_t             :: distinct rawptr // every pthread object in this list is modelled as a single pointer-sized opaque value
+pthread_attr_t        :: distinct rawptr
+pthread_mutex_t       :: distinct rawptr // NOTE(review): Haiku's headers appear to define mutex/cond/rwlock/barrier/spinlock as structs, not pointers — verify the sizes before using these as storage
+pthread_mutexattr_t   :: distinct rawptr
+pthread_cond_t        :: distinct rawptr
+pthread_condattr_t    :: distinct rawptr
+pthread_rwlock_t      :: distinct rawptr
+pthread_rwlockattr_t  :: distinct rawptr
+pthread_barrier_t     :: distinct rawptr
+pthread_barrierattr_t :: distinct rawptr
+pthread_spinlock_t    :: distinct rawptr
+
+pthread_key_t  :: distinct c.int
+pthread_once_t :: struct { // NOTE(review): layout depends on pthread_mutex_t above — re-check if that type changes
+	state: c.int,
+	mutex: pthread_mutex_t,
+}
+
+PTHREAD_MUTEX_DEFAULT    :: 0
+PTHREAD_MUTEX_NORMAL     :: 1
+PTHREAD_MUTEX_ERRORCHECK :: 2
+PTHREAD_MUTEX_RECURSIVE  :: 3
+
+PTHREAD_DETACHED      :: 0x1
+PTHREAD_SCOPE_SYSTEM  :: 0x2
+PTHREAD_INHERIT_SCHED :: 0x4
+PTHREAD_NOFLOAT       :: 0x8
+
+PTHREAD_CREATE_DETACHED :: PTHREAD_DETACHED
+PTHREAD_CREATE_JOINABLE :: 0
+PTHREAD_SCOPE_PROCESS   :: 0
+PTHREAD_EXPLICIT_SCHED  :: 0
+
+SCHED_FIFO     :: 1
+SCHED_RR       :: 2
+SCHED_SPORADIC :: 3
+SCHED_OTHER    :: 4
+
+sched_param :: struct {
+	sched_priority: c.int,
+}
+
+sem_t :: distinct rawptr
+
+PTHREAD_CANCEL_ENABLE       :: 0
+PTHREAD_CANCEL_DISABLE      :: 1
+PTHREAD_CANCEL_DEFERRED     :: 0
+PTHREAD_CANCEL_ASYNCHRONOUS :: 2
+
+foreign import libc "system:c"
+
+@(default_calling_convention="c")
+foreign libc {
+	sem_open :: proc(name: cstring, flags: c.int) -> ^sem_t ---
+
+	sem_init :: proc(sem: ^sem_t, pshared: c.int, initial_value: c.uint) -> c.int ---
+	sem_destroy :: proc(sem: ^sem_t) -> c.int ---
+	sem_post :: proc(sem: ^sem_t) -> c.int ---
+	sem_wait :: proc(sem: ^sem_t) -> c.int ---
+	sem_trywait :: proc(sem: ^sem_t) -> c.int ---
+	
+	pthread_yield :: proc() ---
+
+	pthread_setcancelstate :: proc (state: c.int, old_state: ^c.int) -> c.int ---
+	pthread_setcanceltype  :: proc (type:  c.int, old_type:  ^c.int) -> c.int ---
+	pthread_cancel         :: proc (thread: pthread_t) -> c.int ---
+}
diff --git a/core/sys/unix/pthread_unix.odin b/core/sys/unix/pthread_unix.odin
index 8bf397647..4fe3c8dfa 100644
--- a/core/sys/unix/pthread_unix.odin
+++ b/core/sys/unix/pthread_unix.odin
@@ -1,4 +1,4 @@
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 package unix
 
 foreign import "system:pthread"
@@ -16,6 +16,8 @@ foreign pthread {
 	// retval is a pointer to a location to put the return value of the thread proc.
 	pthread_join :: proc(t: pthread_t, retval: ^rawptr) -> c.int ---
 
+	pthread_kill :: proc(t: pthread_t, sig: c.int) -> c.int ---
+
 	pthread_self :: proc() -> pthread_t ---
 
 	pthread_equal :: proc(a, b: pthread_t) -> b32 ---
@@ -31,15 +33,9 @@ foreign pthread {
 	pthread_attr_getschedparam :: proc(attrs: ^pthread_attr_t, param: ^sched_param) -> c.int ---
 	pthread_attr_setschedparam :: proc(attrs: ^pthread_attr_t, param: ^sched_param) -> c.int ---
 
-	pthread_attr_getschedpolicy :: proc(t: ^pthread_attr_t, policy: ^c.int) -> c.int ---
-	pthread_attr_setschedpolicy :: proc(t: ^pthread_attr_t, policy: c.int) -> c.int ---
-
 	// states: PTHREAD_CREATE_DETACHED, PTHREAD_CREATE_JOINABLE
 	pthread_attr_setdetachstate :: proc(attrs: ^pthread_attr_t, detach_state: c.int) -> c.int ---
-
-	// scheds: PTHREAD_INHERIT_SCHED, PTHREAD_EXPLICIT_SCHED
-	pthread_attr_setinheritsched :: proc(attrs: ^pthread_attr_t, sched: c.int) -> c.int ---
-
+	
 	// NOTE(tetra, 2019-11-06): WARNING: Different systems have different alignment requirements.
 	// For maximum usefulness, use the OS's page size.
 	// ALSO VERY MAJOR WARNING: `stack_ptr` must be the LAST byte of the stack on systems
@@ -52,8 +48,20 @@ foreign pthread {
 	pthread_attr_setstack :: proc(attrs: ^pthread_attr_t, stack_ptr: rawptr, stack_size: u64) -> c.int ---
 	pthread_attr_getstack :: proc(attrs: ^pthread_attr_t, stack_ptr: ^rawptr, stack_size: ^u64) -> c.int ---
 
-	sched_yield :: proc() -> c.int ---
+	pthread_sigmask :: proc(how: c.int, set: rawptr, oldset: rawptr) -> c.int ---
 
+	sched_yield :: proc() -> c.int ---
+}
+
+// NOTE: Unimplemented in Haiku.
+when ODIN_OS != .Haiku {
+	foreign pthread {
+		// scheds: PTHREAD_INHERIT_SCHED, PTHREAD_EXPLICIT_SCHED
+		pthread_attr_setinheritsched :: proc(attrs: ^pthread_attr_t, sched: c.int) -> c.int ---
+
+		pthread_attr_getschedpolicy :: proc(t: ^pthread_attr_t, policy: ^c.int) -> c.int ---
+		pthread_attr_setschedpolicy :: proc(t: ^pthread_attr_t, policy: c.int) -> c.int ---
+	}
 }
 
 @(default_calling_convention="c")
diff --git a/core/sys/unix/time_unix.odin b/core/sys/unix/time_unix.odin
index 108067dd4..088dc378b 100644
--- a/core/sys/unix/time_unix.odin
+++ b/core/sys/unix/time_unix.odin
@@ -1,4 +1,4 @@
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 package unix
 
 when ODIN_OS == .Darwin {
diff --git a/core/thread/thread_unix.odin b/core/thread/thread_unix.odin
index 19e421646..c75710873 100644
--- a/core/thread/thread_unix.odin
+++ b/core/thread/thread_unix.odin
@@ -1,4 +1,4 @@
-// +build linux, darwin, freebsd, openbsd
+// +build linux, darwin, freebsd, openbsd, haiku
 // +private
 package thread
 
@@ -78,7 +78,9 @@ _create :: proc(procedure: Thread_Proc, priority: Thread_Priority) -> ^Thread {
 
 	// NOTE(tetra, 2019-11-01): These only fail if their argument is invalid.
 	assert(unix.pthread_attr_setdetachstate(&attrs, unix.PTHREAD_CREATE_JOINABLE) == 0)
-	assert(unix.pthread_attr_setinheritsched(&attrs, unix.PTHREAD_EXPLICIT_SCHED) == 0)
+	when ODIN_OS != .Haiku {
+		assert(unix.pthread_attr_setinheritsched(&attrs, unix.PTHREAD_EXPLICIT_SCHED) == 0)
+	}
 
 	thread := new(Thread)
 	if thread == nil {
@@ -88,8 +90,11 @@ _create :: proc(procedure: Thread_Proc, priority: Thread_Priority) -> ^Thread {
 
 	// Set thread priority.
 	policy: i32
-	res := unix.pthread_attr_getschedpolicy(&attrs, &policy)
-	assert(res == 0)
+	res: i32
+	when ODIN_OS != .Haiku {
+		res = unix.pthread_attr_getschedpolicy(&attrs, &policy)
+		assert(res == 0)
+	}
 	params: unix.sched_param
 	res = unix.pthread_attr_getschedparam(&attrs, &params)
 	assert(res == 0)
diff --git a/core/time/time_unix.odin b/core/time/time_unix.odin
index ba0d91527..1c46b5994 100644
--- a/core/time/time_unix.odin
+++ b/core/time/time_unix.odin
@@ -1,5 +1,5 @@
 //+private
-//+build linux, darwin, freebsd, openbsd
+//+build linux, darwin, freebsd, openbsd, haiku
 package time
 
 import "core:sys/unix"
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index 0bcb9f298..fdaa971f1 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -18,6 +18,7 @@ enum TargetOsKind : u16 {
 	TargetOs_essence,
 	TargetOs_freebsd,
 	TargetOs_openbsd,
+	TargetOs_haiku,
 	
 	TargetOs_wasi,
 	TargetOs_js,
@@ -78,6 +79,7 @@ gb_global String target_os_names[TargetOs_COUNT] = {
 	str_lit("essence"),
 	str_lit("freebsd"),
 	str_lit("openbsd"),
+	str_lit("haiku"),
 	
 	str_lit("wasi"),
 	str_lit("js"),
@@ -542,6 +544,13 @@ gb_global TargetMetrics target_openbsd_amd64 = {
 	str_lit("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"),
 };
 
+gb_global TargetMetrics target_haiku_amd64 = { // target description for Haiku on x86-64, registered as "haiku_amd64"
+	TargetOs_haiku,
+	TargetArch_amd64,
+	8, 8, 8, 16, // NOTE(review): presumably size/alignment limits in bytes — the TargetMetrics field order is not visible here
+	str_lit("x86_64-unknown-haiku"), // NOTE(review): target_openbsd_amd64 follows its entries with a data-layout string; none is given here — confirm that is intended
+};
+
 gb_global TargetMetrics target_essence_amd64 = {
 	TargetOs_essence,
 	TargetArch_amd64,
@@ -641,6 +650,7 @@ gb_global NamedTargetMetrics named_targets[] = {
 	{ str_lit("freebsd_amd64"),       &target_freebsd_amd64  },
 
 	{ str_lit("openbsd_amd64"),       &target_openbsd_amd64  },
+	{ str_lit("haiku_amd64"),         &target_haiku_amd64    },
 
 	{ str_lit("freestanding_wasm32"), &target_freestanding_wasm32 },
 	{ str_lit("wasi_wasm32"),         &target_wasi_wasm32 },
@@ -872,6 +882,58 @@ gb_internal String internal_odin_root_dir(void) {
 	return path;
 }
 
+#elif defined(GB_SYSTEM_HAIKU)
+
+#include <FindDirectory.h>
+
+gb_internal String path_to_fullpath(gbAllocator a, String s, bool *ok_);
+
+// Haiku: returns the directory of the running executable (trailing separator kept); cached after the first call.
+gb_internal String internal_odin_root_dir(void) {
+	String path = global_module_path;
+	isize len = 0, i;
+	u8 *text;
+
+	if (global_module_path_set) {
+		return global_module_path;
+	}
+
+	auto path_buf = array_make<char>(heap_allocator(), 300);
+	defer (array_free(&path_buf));
+
+	for (;;) {
+		u32 sz = path_buf.count;
+		int res = find_path(B_APP_IMAGE_SYMBOL, B_FIND_PATH_IMAGE_PATH, nullptr, &path_buf[0], sz);
+		if (res == B_OK) {
+			len = gb_strlen(&path_buf[0]); // length of the returned path, not the buffer capacity
+			break;
+		} else if (res != B_BUFFER_OVERFLOW) {
+			break; // a bigger buffer cannot fix this; continue with an empty path instead of spinning
+		}
+		array_resize(&path_buf, sz * 2);
+	}
+
+	mutex_lock(&string_buffer_mutex);
+	defer (mutex_unlock(&string_buffer_mutex));
+
+	text = gb_alloc_array(permanent_allocator(), u8, len + 1);
+	gb_memmove(text, &path_buf[0], len);
+	text[len] = 0;
+	path = path_to_fullpath(heap_allocator(), make_string(text, len), nullptr);
+
+	for (i = path.len-1; i >= 0; i--) {
+		u8 c = path[i];
+		if (c == '/' || c == '\\') {
+			break;
+		}
+		path.len--;
+	}
+
+	global_module_path = path;
+	global_module_path_set = true;
+	return path;
+}
+
 #elif defined(GB_SYSTEM_OSX)
 
 #include <mach-o/dyld.h>
@@ -888,6 +950,7 @@ gb_internal String internal_odin_root_dir(void) {
 	}
 
 	auto path_buf = array_make<char>(heap_allocator(), 300);
+	defer (array_free(&path_buf));
 
 	len = 0;
 	for (;;) {
@@ -920,9 +983,6 @@ gb_internal String internal_odin_root_dir(void) {
 	global_module_path = path;
 	global_module_path_set = true;
 
-
-	// array_free(&path_buf);
-
 	return path;
 }
 #else
@@ -1301,6 +1361,8 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 			metrics = &target_freebsd_amd64;
 		#elif defined(GB_SYSTEM_OPENBSD)
 			metrics = &target_openbsd_amd64;
+		#elif defined(GB_SYSTEM_HAIKU)
+			metrics = &target_haiku_amd64;
 		#elif defined(GB_CPU_ARM)
 			metrics = &target_linux_arm64;
 		#else
@@ -1405,6 +1467,9 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 		case TargetOs_openbsd:
 			bc->link_flags = str_lit("-arch x86-64 ");
 			break;
+		case TargetOs_haiku:
+			bc->link_flags = str_lit("-arch x86-64 ");
+			break;
 		}
 	} else if (bc->metrics.arch == TargetArch_i386) {
 		switch (bc->metrics.os) {
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index c85fb28d6..e1b1cd693 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -4928,6 +4928,7 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
 			case TargetOs_essence:
 			case TargetOs_freebsd:
 			case TargetOs_openbsd:
+			case TargetOs_haiku:
 				switch (build_context.metrics.arch) {
 				case TargetArch_i386:
 				case TargetArch_amd64:
diff --git a/src/checker.cpp b/src/checker.cpp
index 5827fc695..72c0ae574 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -1010,6 +1010,7 @@ gb_internal void init_universal(void) {
 			{"Linux",        TargetOs_linux},
 			{"Essence",      TargetOs_essence},
 			{"FreeBSD",      TargetOs_freebsd},
+			{"Haiku",        TargetOs_haiku},
 			{"OpenBSD",      TargetOs_openbsd},
 			{"WASI",         TargetOs_wasi},
 			{"JS",           TargetOs_js},
diff --git a/src/gb/gb.h b/src/gb/gb.h
index 93d250f21..702647121 100644
--- a/src/gb/gb.h
+++ b/src/gb/gb.h
@@ -83,6 +83,10 @@ extern "C" {
 		#ifndef GB_SYSTEM_OPENBSD
 		#define GB_SYSTEM_OPENBSD 1
 		#endif
+	#elif defined(__HAIKU__) || defined(__haiku__)
+		#ifndef GB_SYSTEM_HAIKU
+		#define GB_SYSTEM_HAIKU 1
+		#endif
 	#else
 		#error This UNIX operating system is not supported
 	#endif
@@ -206,7 +210,7 @@ extern "C" {
 	#endif
 	#include <stdlib.h> // NOTE(bill): malloc on linux
 	#include <sys/mman.h>
-	#if !defined(GB_SYSTEM_OSX) && !defined(__FreeBSD__) && !defined(__OpenBSD__)
+	#if !defined(GB_SYSTEM_OSX) && !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__HAIKU__)
 		#include <sys/sendfile.h>
 	#endif
 	#include <sys/stat.h>
@@ -247,6 +251,13 @@ extern "C" {
 	#include <pthread_np.h>
 	#define lseek64 lseek
 #endif
+
+#if defined(GB_SYSTEM_HAIKU)
+	#include <stdio.h>
+	#include <pthread.h>
+	#include <kernel/OS.h>
+	#define lseek64 lseek
+#endif
     
 #if defined(GB_SYSTEM_UNIX)
 	#include <semaphore.h>
@@ -801,6 +812,13 @@ typedef struct gbAffinity {
 	isize thread_count;
 	isize threads_per_core;
 } gbAffinity;
+#elif defined(GB_SYSTEM_HAIKU)
+typedef struct gbAffinity {
+	b32   is_accurate;
+	isize core_count;
+	isize thread_count;
+	isize threads_per_core;
+} gbAffinity;
 #else
 #error TODO(bill): Unknown system
 #endif
@@ -2984,6 +3002,8 @@ gb_inline u32 gb_thread_current_id(void) {
 	__asm__("mov %%fs:0x10,%0" : "=r"(thread_id));
 #elif defined(GB_SYSTEM_LINUX)
 	thread_id = gettid();
+#elif defined(GB_SYSTEM_HAIKU)
+	thread_id = find_thread(NULL);
 #else
 	#error Unsupported architecture for gb_thread_current_id()
 #endif
@@ -3184,7 +3204,9 @@ b32 gb_affinity_set(gbAffinity *a, isize core, isize thread_index) {
 	//info.affinity_tag = cast(integer_t)index;
 	//result = thread_policy_set(thread, THREAD_AFFINITY_POLICY, cast(thread_policy_t)&info, THREAD_AFFINITY_POLICY_COUNT);
 
+#if !defined(GB_SYSTEM_HAIKU)
 	result = pthread_setaffinity_np(thread, sizeof(cpuset_t), &mn);
+#endif
 	return result == 0;
 }
 
@@ -3236,6 +3258,29 @@ b32 gb_affinity_set(gbAffinity *a, isize core, isize thread_index) {
 	return true;
 }
 
+isize gb_affinity_thread_count_for_core(gbAffinity *a, isize core) {
+	GB_ASSERT(0 <= core && core < a->core_count);
+	return a->threads_per_core;
+}
+#elif defined(GB_SYSTEM_HAIKU)
+#include <unistd.h>
+
+void gb_affinity_init(gbAffinity *a) {
+	isize online = sysconf(_SC_NPROCESSORS_ONLN); // a non-positive result is treated as "unknown"
+	a->is_accurate      = online > 0;
+	a->core_count       = a->is_accurate ? online : 1; // fall back to a single core
+	a->threads_per_core = 1;
+	a->thread_count     = a->core_count;
+}
+
+void gb_affinity_destroy(gbAffinity *a) {
+	gb_unused(a);
+}
+
+b32 gb_affinity_set(gbAffinity *a, isize core, isize thread_index) {
+	return true;
+}
+
 isize gb_affinity_thread_count_for_core(gbAffinity *a, isize core) {
 	GB_ASSERT(0 <= core && core < a->core_count);
 	return a->threads_per_core;
@@ -5457,7 +5502,7 @@ gb_inline b32 gb_file_copy(char const *existing_filename, char const *new_filena
 		}
 	}
 	
-	gb_free(buf);
+	gb_mfree(buf);
 	close(new_fd);
 	close(existing_fd);
 
diff --git a/src/linker.cpp b/src/linker.cpp
index 0144c4aaf..0cdeaf8d9 100644
--- a/src/linker.cpp
+++ b/src/linker.cpp
@@ -474,8 +474,8 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 					link_settings = gb_string_appendc(link_settings, "-Wl,-fini,'_odin_exit_point' ");
 				}
 
-			} else if (build_context.metrics.os != TargetOs_openbsd) {
-				// OpenBSD defaults to PIE executable. do not pass -no-pie for it.
+			} else if (build_context.metrics.os != TargetOs_openbsd && build_context.metrics.os != TargetOs_haiku) {
+				// OpenBSD and Haiku default to PIE executable. do not pass -no-pie for it.
 				link_settings = gb_string_appendc(link_settings, "-no-pie ");
 			}
 
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index efba19f23..ca4341525 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -2564,8 +2564,8 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 
 	switch (build_context.reloc_mode) {
 	case RelocMode_Default:
-		if (build_context.metrics.os == TargetOs_openbsd) {
-			// Always use PIC for OpenBSD: it defaults to PIE
+		if (build_context.metrics.os == TargetOs_openbsd || build_context.metrics.os == TargetOs_haiku) {
+			// Always use PIC for OpenBSD and Haiku: they default to PIE
 			reloc_mode = LLVMRelocPIC;
 		}
 		break;
diff --git a/src/path.cpp b/src/path.cpp
index de80c9def..742bba7f8 100644
--- a/src/path.cpp
+++ b/src/path.cpp
@@ -1,461 +1,461 @@
-/*
-	Path handling utilities.
-*/
-#if !defined(GB_SYSTEM_WINDOWS)
-#include <unistd.h>
-#endif
-
-gb_internal String remove_extension_from_path(String const &s) {
-	if (s.len != 0 && s.text[s.len-1] == '.') {
-		return s;
-	}
-	for (isize i = s.len-1; i >= 0; i--) {
-		if (s[i] == '.') {
-			return substring(s, 0, i);
-		}
-	}
-	return s;
-}
-
-gb_internal String remove_directory_from_path(String const &s) {
-	isize len = 0;
-	for (isize i = s.len-1; i >= 0; i--) {
-		if (s[i] == '/' ||
-		    s[i] == '\\') {
-			break;
-		}
-		len += 1;
-	}
-	return substring(s, s.len-len, s.len);
-}
-
-
-// NOTE(Mark Naughton): getcwd as String
-#if !defined(GB_SYSTEM_WINDOWS)
-gb_internal String get_current_directory(void) {
-	char cwd[256];
-	getcwd(cwd, 256);
-
-	return make_string_c(cwd);
-}
-
-#else
-gb_internal String get_current_directory(void) {
-	gbAllocator a = heap_allocator();
-
-	wchar_t cwd[256];
-	GetCurrentDirectoryW(256, cwd);
-
-	String16 wstr = make_string16_c(cwd);
-
-	return string16_to_string(a, wstr);
-}
-#endif
-
-gb_internal bool path_is_directory(String path);
-
-gb_internal String directory_from_path(String const &s) {
-	if (path_is_directory(s)) {
-		return s;
-	}
-
-	isize i = s.len-1;
-	for (; i >= 0; i--) {
-		if (s[i] == '/' ||
-		    s[i] == '\\') {
-			break;
-		}
-	}
-	if (i >= 0) {
-		return substring(s, 0, i);	
-	}
-	return substring(s, 0, 0);
-}
-
-#if defined(GB_SYSTEM_WINDOWS)
-	gb_internal bool path_is_directory(String path) {
-		gbAllocator a = heap_allocator();
-		String16 wstr = string_to_string16(a, path);
-		defer (gb_free(a, wstr.text));
-
-		i32 attribs = GetFileAttributesW(wstr.text);
-		if (attribs < 0) return false;
-
-		return (attribs & FILE_ATTRIBUTE_DIRECTORY) != 0;
-	}
-
-#else
-	gb_internal bool path_is_directory(String path) {
-		gbAllocator a = heap_allocator();
-		char *copy = cast(char *)copy_string(a, path).text;
-		defer (gb_free(a, copy));
-
-		struct stat s;
-		if (stat(copy, &s) == 0) {
-			return (s.st_mode & S_IFDIR) != 0;
-		}
-		return false;
-	}
-#endif
-
-
-gb_internal String path_to_full_path(gbAllocator a, String path) {
-	gbAllocator ha = heap_allocator();
-	char *path_c = gb_alloc_str_len(ha, cast(char *)path.text, path.len);
-	defer (gb_free(ha, path_c));
-
-	char *fullpath = gb_path_get_full_name(a, path_c);
-	String res = string_trim_whitespace(make_string_c(fullpath));
-#if defined(GB_SYSTEM_WINDOWS)
-	for (isize i = 0; i < res.len; i++) {
-		if (res.text[i] == '\\') {
-			res.text[i] = '/';
-		}
-	}
-#endif
-	return copy_string(a, res);
-}
-
-struct Path {
-	String basename;
-	String name;
-	String ext;
-};
-
-// NOTE(Jeroen): Naively turns a Path into a string.
-gb_internal String path_to_string(gbAllocator a, Path path) {
-	if (path.basename.len + path.name.len + path.ext.len == 0) {
-		return make_string(nullptr, 0);
-	}
-
-	isize len = path.basename.len + 1 + path.name.len + 1;
-	if (path.ext.len > 0) {
-		 len += path.ext.len + 1;
-	}
-
-	u8 *str = gb_alloc_array(a, u8, len);
-
-	isize i = 0;
-	gb_memmove(str+i, path.basename.text, path.basename.len); i += path.basename.len;
-	
-	gb_memmove(str+i, "/", 1);                                i += 1;
-	
-	gb_memmove(str+i, path.name.text,     path.name.len);     i += path.name.len;
-	if (path.ext.len > 0) {
-		gb_memmove(str+i, ".", 1);                            i += 1;
-		gb_memmove(str+i, path.ext.text,  path.ext.len);      i += path.ext.len;
-	}
-	str[i] = 0;
-
-	String res = make_string(str, i);
-	res        = string_trim_whitespace(res);
-	return res;
-}
-
-// NOTE(Jeroen): Naively turns a Path into a string, then normalizes it using `path_to_full_path`.
-gb_internal String path_to_full_path(gbAllocator a, Path path) {
-	String temp = path_to_string(heap_allocator(), path);
-	defer (gb_free(heap_allocator(), temp.text));
-
-	return path_to_full_path(a, temp);
-}
-
-// NOTE(Jeroen): Takes a path like "odin" or "W:\Odin", turns it into a full path,
-// and then breaks it into its components to make a Path.
-gb_internal Path path_from_string(gbAllocator a, String const &path) {
-	Path res = {};
-
-	if (path.len == 0) return res;
-
-	String fullpath = path_to_full_path(a, path);
-	defer (gb_free(heap_allocator(), fullpath.text));
-
-	res.basename = directory_from_path(fullpath);	
-	res.basename = copy_string(a, res.basename);
-
-	if (path_is_directory(fullpath)) {
-		// It's a directory. We don't need to tinker with the name and extension.
-		// It could have a superfluous trailing `/`. Remove it if so.
-		if (res.basename.len > 0 && res.basename.text[res.basename.len - 1] == '/') {
-			res.basename.len--;
-		}
-		return res;
-	}
-
-	// Note(Dragos): Is the copy_string required if it's a substring?
-	isize name_start = (res.basename.len > 0) ? res.basename.len + 1 : res.basename.len;
-	res.name         = substring(fullpath, name_start, fullpath.len);
-	res.name         = remove_extension_from_path(res.name);
-	res.name         = copy_string(a, res.name);
-
-	res.ext          = path_extension(fullpath, false); // false says not to include the dot.
-	res.ext          = copy_string(a, res.ext);
-	return res;
-}
-
-// NOTE(Jeroen): Takes a path String and returns the last path element.
-gb_internal String last_path_element(String const &path) {
-	isize count = 0;
-	u8 * start = (u8 *)(&path.text[path.len - 1]);
-	for (isize length = path.len; length > 0 && path.text[length - 1] != '/'; length--) {
-		count++;
-		start--;
-	}
-	if (count > 0) {
-		start++; // Advance past the `/` and return the substring.
-		String res = make_string(start, count);
-		return res;
-	}
-	// Must be a root path like `/` or `C:/`, return empty String.
-	return STR_LIT("");
-}
-
-gb_internal bool path_is_directory(Path path) {
-	String path_string = path_to_full_path(heap_allocator(), path);
-	defer (gb_free(heap_allocator(), path_string.text));
-
-	return path_is_directory(path_string);
-}
-
-struct FileInfo {
-	String name;
-	String fullpath;
-	i64    size;
-	bool   is_dir;
-};
-
-enum ReadDirectoryError {
-	ReadDirectory_None,
-
-	ReadDirectory_InvalidPath,
-	ReadDirectory_NotExists,
-	ReadDirectory_Permission,
-	ReadDirectory_NotDir,
-	ReadDirectory_Empty,
-	ReadDirectory_Unknown,
-
-	ReadDirectory_COUNT,
-};
-
-gb_internal i64 get_file_size(String path) {
-	char *c_str = alloc_cstring(heap_allocator(), path);
-	defer (gb_free(heap_allocator(), c_str));
-
-	gbFile f = {};
-	gbFileError err = gb_file_open(&f, c_str);
-	defer (gb_file_close(&f));
-	if (err != gbFileError_None) {
-		return -1;
-	}
-	return gb_file_size(&f);
-}
-
-
-#if defined(GB_SYSTEM_WINDOWS)
-gb_internal ReadDirectoryError read_directory(String path, Array<FileInfo> *fi) {
-	GB_ASSERT(fi != nullptr);
-
-
-	while (path.len > 0) {
-		Rune end = path[path.len-1];
-		if (end == '/') {
-			path.len -= 1;
-		} else if (end == '\\') {
-			path.len -= 1;
-		} else {
-			break;
-		}
-	}
-
-	if (path.len == 0) {
-		return ReadDirectory_InvalidPath;
-	}
-	{
-		char *c_str = alloc_cstring(temporary_allocator(), path);
-		gbFile f = {};
-		gbFileError file_err = gb_file_open(&f, c_str);
-		defer (gb_file_close(&f));
-
-		switch (file_err) {
-		case gbFileError_Invalid:    return ReadDirectory_InvalidPath;
-		case gbFileError_NotExists:  return ReadDirectory_NotExists;
-		// case gbFileError_Permission: return ReadDirectory_Permission;
-		}
-	}
-
-	if (!path_is_directory(path)) {
-		return ReadDirectory_NotDir;
-	}
-
-
-	gbAllocator a = heap_allocator();
-	char *new_path = gb_alloc_array(a, char, path.len+3);
-	defer (gb_free(a, new_path));
-
-	gb_memmove(new_path, path.text, path.len);
-	gb_memmove(new_path+path.len, "/*", 2);
-	new_path[path.len+2] = 0;
-
-	String np = make_string(cast(u8 *)new_path, path.len+2);
-	String16 wstr = string_to_string16(a, np);
-	defer (gb_free(a, wstr.text));
-
-	WIN32_FIND_DATAW file_data = {};
-	HANDLE find_file = FindFirstFileW(wstr.text, &file_data);
-	if (find_file == INVALID_HANDLE_VALUE) {
-		return ReadDirectory_Unknown;
-	}
-	defer (FindClose(find_file));
-
-	array_init(fi, a, 0, 100);
-
-	do {
-		wchar_t *filename_w = file_data.cFileName;
-		u64 size = cast(u64)file_data.nFileSizeLow;
-		size |= (cast(u64)file_data.nFileSizeHigh) << 32;
-		String name = string16_to_string(a, make_string16_c(filename_w));
-		if (name == "." || name == "..") {
-			gb_free(a, name.text);
-			continue;
-		}
-
-		String filepath = {};
-		filepath.len = path.len+1+name.len;
-		filepath.text = gb_alloc_array(a, u8, filepath.len+1);
-		defer (gb_free(a, filepath.text));
-		gb_memmove(filepath.text, path.text, path.len);
-		gb_memmove(filepath.text+path.len, "/", 1);
-		gb_memmove(filepath.text+path.len+1, name.text, name.len);
-
-		FileInfo info = {};
-		info.name = name;
-		info.fullpath = path_to_full_path(a, filepath);
-		info.size = cast(i64)size;
-		info.is_dir = (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
-		array_add(fi, info);
-	} while (FindNextFileW(find_file, &file_data));
-
-	if (fi->count == 0) {
-		return ReadDirectory_Empty;
-	}
-
-	return ReadDirectory_None;
-}
-#elif defined(GB_SYSTEM_LINUX) || defined(GB_SYSTEM_OSX) || defined(GB_SYSTEM_FREEBSD) || defined(GB_SYSTEM_OPENBSD)
-
-#include <dirent.h>
-
-gb_internal ReadDirectoryError read_directory(String path, Array<FileInfo> *fi) {
-	GB_ASSERT(fi != nullptr);
-
-	gbAllocator a = heap_allocator();
-
-	char *c_path = alloc_cstring(a, path);
-	defer (gb_free(a, c_path));
-
-	DIR *dir = opendir(c_path);
-	if (!dir) {
-		switch (errno) {
-		case ENOENT:
-			return ReadDirectory_NotExists;
-		case EACCES:
-			return ReadDirectory_Permission;
-		case ENOTDIR:
-			return ReadDirectory_NotDir;
-		default:
-			// ENOMEM: out of memory
-			// EMFILE: per-process limit on open fds reached
-			// ENFILE: system-wide limit on total open files reached
-			return ReadDirectory_Unknown;
-		}
-		GB_PANIC("unreachable");
-	}
-
-	array_init(fi, a, 0, 100);
-
-	for (;;) {
-		struct dirent *entry = readdir(dir);
-		if (entry == nullptr) {
-			break;
-		}
-
-		String name = make_string_c(entry->d_name);
-		if (name == "." || name == "..") {
-			continue;
-		}
-
-		String filepath = {};
-		filepath.len = path.len+1+name.len;
-		filepath.text = gb_alloc_array(a, u8, filepath.len+1);
-		defer (gb_free(a, filepath.text));
-		gb_memmove(filepath.text, path.text, path.len);
-		gb_memmove(filepath.text+path.len, "/", 1);
-		gb_memmove(filepath.text+path.len+1, name.text, name.len);
-		filepath.text[filepath.len] = 0;
-
-
-		struct stat dir_stat = {};
-
-		if (stat((char *)filepath.text, &dir_stat)) {
-			continue;
-		}
-
-		if (S_ISDIR(dir_stat.st_mode)) {
-			continue;
-		}
-
-		i64 size = dir_stat.st_size;
-
-		FileInfo info = {};
-		info.name = name;
-		info.fullpath = path_to_full_path(a, filepath);
-		info.size = size;
-		array_add(fi, info);
-	}
-
-	if (fi->count == 0) {
-		return ReadDirectory_Empty;
-	}
-
-	return ReadDirectory_None;
-}
-
-
-#else
-#error Implement read_directory
-#endif
-
-#if !defined(GB_SYSTEM_WINDOWS)
-gb_internal bool write_directory(String path) {
-	char const *pathname = (char *) path.text;
-
-	if (access(pathname, W_OK) < 0) {
-		return false;
-	}
-
-	return true;
-}
-#else
-gb_internal bool write_directory(String path) {
-	String16 wstr = string_to_string16(heap_allocator(), path);
-	LPCWSTR wdirectory_name = wstr.text;
-
-	HANDLE directory = CreateFileW(wdirectory_name,
-			GENERIC_WRITE,
-			0,
-			NULL,
-			OPEN_EXISTING,
-			FILE_FLAG_BACKUP_SEMANTICS,
-			NULL);
-
-	if (directory == INVALID_HANDLE_VALUE) {
-		DWORD error_code = GetLastError();
-		if (error_code == ERROR_ACCESS_DENIED) {
-			return false;
-		}
-	}
-
-	CloseHandle(directory);
-	return true;
-}
-#endif
+/*
+	Path handling utilities.
+*/
+#if !defined(GB_SYSTEM_WINDOWS)
+#include <unistd.h>
+#endif
+
+// Strips the extension from the last component of `s`; returns `s` unchanged if it ends in `.` or has none.
+gb_internal String remove_extension_from_path(String const &s) {
+	if (s.len != 0 && s.text[s.len-1] == '.') {
+		return s;
+	}
+	for (isize i = s.len-1; i >= 0; i--) {
+		if (s[i] == '/' || s[i] == '\\') break; // a dot inside a directory name is not an extension
+		if (s[i] == '.') return substring(s, 0, i);
+	}
+	return s;
+}
+
+// Returns the final component of `s`: everything after the last `/` or `\`, or all of `s` if neither occurs.
+gb_internal String remove_directory_from_path(String const &s) {
+	// Scan backwards for the start of the last component.
+	isize start = s.len;
+	while (start > 0) {
+		u8 c = s[start-1];
+		if (c == '/' || c == '\\') break;
+		start -= 1;
+	}
+	return substring(s, start, s.len);
+}
+
+
+// NOTE(Mark Naughton): getcwd as String
+#if !defined(GB_SYSTEM_WINDOWS)
+gb_internal String get_current_directory(void) {
+	char cwd[256];
+	if (getcwd(cwd, 256) == nullptr) return make_string(nullptr, 0); // cwd unavailable or too long for the buffer
+	// NOTE(review): make_string_c is assumed to only wrap the pointer, so copy out of the stack buffer before returning.
+	return copy_string(heap_allocator(), make_string_c(cwd));
+}
+
+#else
+gb_internal String get_current_directory(void) {
+	gbAllocator a = heap_allocator();
+
+	wchar_t cwd[256];
+	GetCurrentDirectoryW(256, cwd);
+
+	String16 wstr = make_string16_c(cwd);
+
+	return string16_to_string(a, wstr);
+}
+#endif
+
+gb_internal bool path_is_directory(String path);
+
+// Returns `s` itself when it names an existing directory; otherwise the text before the last `/` or `\` (empty if none).
+gb_internal String directory_from_path(String const &s) {
+	if (path_is_directory(s)) {
+		return s;
+	}
+
+	// Locate the last separator, scanning from the end.
+	isize sep = s.len-1;
+	while (sep >= 0 && s[sep] != '/' && s[sep] != '\\') {
+		sep -= 1;
+	}
+
+	if (sep < 0) {
+		return substring(s, 0, 0);
+	}
+	return substring(s, 0, sep);
+}
+
+#if defined(GB_SYSTEM_WINDOWS)
+	gb_internal bool path_is_directory(String path) {
+		gbAllocator a = heap_allocator();
+		String16 wstr = string_to_string16(a, path);
+		defer (gb_free(a, wstr.text));
+
+		i32 attribs = GetFileAttributesW(wstr.text);
+		if (attribs < 0) return false;
+
+		return (attribs & FILE_ATTRIBUTE_DIRECTORY) != 0;
+	}
+
+#else
+	// Returns true when `path` can be stat'ed and the result is a directory; false on any stat failure.
+	gb_internal bool path_is_directory(String path) {
+		gbAllocator a = heap_allocator();
+		char *copy = cast(char *)copy_string(a, path).text;
+		defer (gb_free(a, copy));
+
+		struct stat s;
+		// S_ISDIR compares the whole file-type field; testing the S_IFDIR bit alone
+		// also matches other types that share it (e.g. block devices).
+		return stat(copy, &s) == 0 && S_ISDIR(s.st_mode);
+	}
+#endif
+
+
+gb_internal String path_to_full_path(gbAllocator a, String path) {
+	gbAllocator ha = heap_allocator();
+	char *path_c = gb_alloc_str_len(ha, cast(char *)path.text, path.len);
+	defer (gb_free(ha, path_c));
+
+	char *fullpath = gb_path_get_full_name(a, path_c);
+	String res = string_trim_whitespace(make_string_c(fullpath));
+#if defined(GB_SYSTEM_WINDOWS)
+	for (isize i = 0; i < res.len; i++) {
+		if (res.text[i] == '\\') {
+			res.text[i] = '/';
+		}
+	}
+#endif
+	return copy_string(a, res);
+}
+
+struct Path {
+	String basename;
+	String name;
+	String ext;
+};
+
+// NOTE(Jeroen): Naively turns a Path into a string.
+gb_internal String path_to_string(gbAllocator a, Path path) {
+	if (path.basename.len + path.name.len + path.ext.len == 0) {
+		return make_string(nullptr, 0);
+	}
+
+	isize len = path.basename.len + 1 + path.name.len + 1;
+	if (path.ext.len > 0) {
+		 len += path.ext.len + 1;
+	}
+
+	u8 *str = gb_alloc_array(a, u8, len);
+
+	isize i = 0;
+	gb_memmove(str+i, path.basename.text, path.basename.len); i += path.basename.len;
+	
+	gb_memmove(str+i, "/", 1);                                i += 1;
+	
+	gb_memmove(str+i, path.name.text,     path.name.len);     i += path.name.len;
+	if (path.ext.len > 0) {
+		gb_memmove(str+i, ".", 1);                            i += 1;
+		gb_memmove(str+i, path.ext.text,  path.ext.len);      i += path.ext.len;
+	}
+	str[i] = 0;
+
+	String res = make_string(str, i);
+	res        = string_trim_whitespace(res);
+	return res;
+}
+
+// NOTE(Jeroen): Naively turns a Path into a string, then normalizes it using `path_to_full_path`.
+gb_internal String path_to_full_path(gbAllocator a, Path path) {
+	String temp = path_to_string(heap_allocator(), path);
+	defer (gb_free(heap_allocator(), temp.text));
+
+	return path_to_full_path(a, temp);
+}
+
+// NOTE(Jeroen): Takes a path like "odin" or "W:\Odin", turns it into a full path,
+// and then breaks it into its components to make a Path.
+gb_internal Path path_from_string(gbAllocator a, String const &path) {
+	Path res = {};
+
+	if (path.len == 0) return res;
+
+	String fullpath = path_to_full_path(a, path);
+	defer (gb_free(a, fullpath.text)); // allocated with `a` just above, so release it with `a` too
+
+	res.basename = directory_from_path(fullpath);
+	res.basename = copy_string(a, res.basename);
+
+	if (path_is_directory(fullpath)) {
+		// It's a directory. We don't need to tinker with the name and extension.
+		// It could have a superfluous trailing `/`. Remove it if so.
+		if (res.basename.len > 0 && res.basename.text[res.basename.len - 1] == '/') {
+			res.basename.len--;
+		}
+		return res;
+	}
+
+	// res.name starts as a substring of fullpath, which is freed on return, so it is copied before use.
+	isize name_start = (res.basename.len > 0) ? res.basename.len + 1 : res.basename.len;
+	res.name         = substring(fullpath, name_start, fullpath.len);
+	res.name         = remove_extension_from_path(res.name);
+	res.name         = copy_string(a, res.name);
+
+	res.ext          = path_extension(fullpath, false); // false says not to include the dot.
+	res.ext          = copy_string(a, res.ext);
+	return res;
+}
+
+// NOTE(Jeroen): Takes a path String and returns the last path element.
+gb_internal String last_path_element(String const &path) {
+	// Walk back from the end to the index just past the last `/` (0 when there is none).
+	// Working with an index avoids forming a pointer before the start of the buffer.
+	isize begin = path.len;
+	while (begin > 0 && path.text[begin - 1] != '/') {
+		begin--;
+	}
+
+	isize count = path.len - begin;
+	if (count > 0) {
+		return make_string(path.text + begin, count);
+	}
+	// Empty input or a trailing `/` (a root path like `/` or `C:/`): return empty String.
+	return STR_LIT("");
+}
+
+gb_internal bool path_is_directory(Path path) {
+	String path_string = path_to_full_path(heap_allocator(), path);
+	defer (gb_free(heap_allocator(), path_string.text));
+
+	return path_is_directory(path_string);
+}
+
+struct FileInfo {
+	String name;
+	String fullpath;
+	i64    size;
+	bool   is_dir;
+};
+
+enum ReadDirectoryError {
+	ReadDirectory_None,
+
+	ReadDirectory_InvalidPath,
+	ReadDirectory_NotExists,
+	ReadDirectory_Permission,
+	ReadDirectory_NotDir,
+	ReadDirectory_Empty,
+	ReadDirectory_Unknown,
+
+	ReadDirectory_COUNT,
+};
+
+gb_internal i64 get_file_size(String path) {
+	char *c_str = alloc_cstring(heap_allocator(), path);
+	defer (gb_free(heap_allocator(), c_str));
+
+	gbFile f = {};
+	gbFileError err = gb_file_open(&f, c_str);
+	defer (gb_file_close(&f));
+	if (err != gbFileError_None) {
+		return -1;
+	}
+	return gb_file_size(&f);
+}
+
+
+#if defined(GB_SYSTEM_WINDOWS)
+gb_internal ReadDirectoryError read_directory(String path, Array<FileInfo> *fi) {
+	GB_ASSERT(fi != nullptr);
+
+
+	while (path.len > 0) {
+		Rune end = path[path.len-1];
+		if (end == '/') {
+			path.len -= 1;
+		} else if (end == '\\') {
+			path.len -= 1;
+		} else {
+			break;
+		}
+	}
+
+	if (path.len == 0) {
+		return ReadDirectory_InvalidPath;
+	}
+	{
+		char *c_str = alloc_cstring(temporary_allocator(), path);
+		gbFile f = {};
+		gbFileError file_err = gb_file_open(&f, c_str);
+		defer (gb_file_close(&f));
+
+		switch (file_err) {
+		case gbFileError_Invalid:    return ReadDirectory_InvalidPath;
+		case gbFileError_NotExists:  return ReadDirectory_NotExists;
+		// case gbFileError_Permission: return ReadDirectory_Permission;
+		}
+	}
+
+	if (!path_is_directory(path)) {
+		return ReadDirectory_NotDir;
+	}
+
+
+	gbAllocator a = heap_allocator();
+	char *new_path = gb_alloc_array(a, char, path.len+3);
+	defer (gb_free(a, new_path));
+
+	gb_memmove(new_path, path.text, path.len);
+	gb_memmove(new_path+path.len, "/*", 2);
+	new_path[path.len+2] = 0;
+
+	String np = make_string(cast(u8 *)new_path, path.len+2);
+	String16 wstr = string_to_string16(a, np);
+	defer (gb_free(a, wstr.text));
+
+	WIN32_FIND_DATAW file_data = {};
+	HANDLE find_file = FindFirstFileW(wstr.text, &file_data);
+	if (find_file == INVALID_HANDLE_VALUE) {
+		return ReadDirectory_Unknown;
+	}
+	defer (FindClose(find_file));
+
+	array_init(fi, a, 0, 100);
+
+	do {
+		wchar_t *filename_w = file_data.cFileName;
+		u64 size = cast(u64)file_data.nFileSizeLow;
+		size |= (cast(u64)file_data.nFileSizeHigh) << 32;
+		String name = string16_to_string(a, make_string16_c(filename_w));
+		if (name == "." || name == "..") {
+			gb_free(a, name.text);
+			continue;
+		}
+
+		String filepath = {};
+		filepath.len = path.len+1+name.len;
+		filepath.text = gb_alloc_array(a, u8, filepath.len+1);
+		defer (gb_free(a, filepath.text));
+		gb_memmove(filepath.text, path.text, path.len);
+		gb_memmove(filepath.text+path.len, "/", 1);
+		gb_memmove(filepath.text+path.len+1, name.text, name.len);
+
+		FileInfo info = {};
+		info.name = name;
+		info.fullpath = path_to_full_path(a, filepath);
+		info.size = cast(i64)size;
+		info.is_dir = (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;
+		array_add(fi, info);
+	} while (FindNextFileW(find_file, &file_data));
+
+	if (fi->count == 0) {
+		return ReadDirectory_Empty;
+	}
+
+	return ReadDirectory_None;
+}
+#elif defined(GB_SYSTEM_LINUX) || defined(GB_SYSTEM_OSX) || defined(GB_SYSTEM_FREEBSD) || defined(GB_SYSTEM_OPENBSD) || defined(GB_SYSTEM_HAIKU)
+
+#include <dirent.h>
+
+// Lists the non-directory entries of `path` into `fi`; entries that cannot be stat'ed are skipped.
+gb_internal ReadDirectoryError read_directory(String path, Array<FileInfo> *fi) {
+	GB_ASSERT(fi != nullptr);
+
+	gbAllocator a = heap_allocator();
+
+	char *c_path = alloc_cstring(a, path);
+	defer (gb_free(a, c_path));
+
+	DIR *dir = opendir(c_path);
+	if (!dir) {
+		switch (errno) {
+		case ENOENT:
+			return ReadDirectory_NotExists;
+		case EACCES:
+			return ReadDirectory_Permission;
+		case ENOTDIR:
+			return ReadDirectory_NotDir;
+		default:
+			// ENOMEM: out of memory
+			// EMFILE: per-process limit on open fds reached
+			// ENFILE: system-wide limit on total open files reached
+			return ReadDirectory_Unknown;
+		}
+		GB_PANIC("unreachable");
+	}
+	defer (closedir(dir)); // release the directory stream on every exit path
+
+	array_init(fi, a, 0, 100);
+
+	for (;;) {
+		struct dirent *entry = readdir(dir);
+		if (entry == nullptr) {
+			break;
+		}
+
+		String name = make_string_c(entry->d_name);
+		if (name == "." || name == "..") {
+			continue;
+		}
+
+		String filepath = {};
+		filepath.len = path.len+1+name.len;
+		filepath.text = gb_alloc_array(a, u8, filepath.len+1);
+		defer (gb_free(a, filepath.text));
+		gb_memmove(filepath.text, path.text, path.len);
+		gb_memmove(filepath.text+path.len, "/", 1);
+		gb_memmove(filepath.text+path.len+1, name.text, name.len);
+		filepath.text[filepath.len] = 0;
+
+		struct stat dir_stat = {};
+		if (stat((char *)filepath.text, &dir_stat)) {
+			continue;
+		}
+
+		if (S_ISDIR(dir_stat.st_mode)) {
+			continue;
+		}
+
+		i64 size = dir_stat.st_size;
+
+		FileInfo info = {};
+		info.name = copy_string(a, name); // d_name belongs to the DIR stream; keep an owned copy
+		info.fullpath = path_to_full_path(a, filepath);
+		info.size = size;
+		array_add(fi, info);
+	}
+
+	if (fi->count == 0) {
+		return ReadDirectory_Empty;
+	}
+
+	return ReadDirectory_None;
+}
+
+
+#else
+#error Implement read_directory
+#endif
+
+#if !defined(GB_SYSTEM_WINDOWS)
+// Reports whether the calling process may write to `path`, as judged by access(2).
+gb_internal bool write_directory(String path) {
+	// String carries an explicit length and need not be NUL-terminated, so hand
+	// the C API its own terminated copy instead of the raw text pointer.
+	char *pathname = alloc_cstring(heap_allocator(), path);
+	defer (gb_free(heap_allocator(), pathname));
+
+	return access(pathname, W_OK) >= 0;
+}
+#else
+// Reports whether `path` can be opened for writing; only a definite access-denied counts as "no".
+gb_internal bool write_directory(String path) {
+	String16 wstr = string_to_string16(heap_allocator(), path);
+	defer (gb_free(heap_allocator(), wstr.text)); // the UTF-16 copy is only needed for the call below
+
+	HANDLE directory = CreateFileW(wstr.text,
+			GENERIC_WRITE,
+			0,
+			NULL,
+			OPEN_EXISTING,
+			FILE_FLAG_BACKUP_SEMANTICS,
+			NULL);
+
+	if (directory == INVALID_HANDLE_VALUE) {
+		// Nothing was opened, so there is no handle to close. As before, failures
+		// other than access-denied are still reported as writable.
+		return GetLastError() != ERROR_ACCESS_DENIED;
+	}
+
+	CloseHandle(directory);
+	return true;
+}
+#endif
diff --git a/src/threading.cpp b/src/threading.cpp
index 725b58c89..a469435d2 100644
--- a/src/threading.cpp
+++ b/src/threading.cpp
@@ -492,6 +492,8 @@ gb_internal u32 thread_current_id(void) {
 	__asm__("mov %%fs:0x10,%0" : "=r"(thread_id));
 #elif defined(GB_SYSTEM_LINUX)
 	thread_id = gettid();
+#elif defined(GB_SYSTEM_HAIKU)
+	thread_id = find_thread(NULL);
 #else
 	#error Unsupported architecture for thread_current_id()
 #endif
@@ -831,8 +833,178 @@ gb_internal void futex_wait(Futex *f, Footex val) {
 		WaitOnAddress(f, (void *)&val, sizeof(val), INFINITE);
 	} while (f->load() == val);
 }
+
+#elif defined(GB_SYSTEM_HAIKU)
+
+// Futex implementation taken from https://tavianator.com/2023/futex.html
+
+#include <pthread.h>
+#include <atomic>
+
+// Minimal test-and-set spin lock built on std::atomic_flag.
+struct _Spinlock {
+	std::atomic_flag state;
+
+	// Puts the flag into the cleared (unlocked) state.
+	void init() { state.clear(); }
+
+	// Spins until test_and_set observes the flag clear.
+	void lock() {
+		for (;;) {
+			if (!state.test_and_set(std::memory_order_acquire)) { return; }
+			#if defined(GB_CPU_X86)
+			_mm_pause();
+			#else
+			(void)0; // spin...
+			#endif
+		}
+	}
+
+	void unlock() { state.clear(std::memory_order_release); } // release the flag
+};
+
+struct Futex_Waitq;
+ 
+struct Futex_Waiter { // one node per thread inside futex_wait
+	_Spinlock lock; // taken by futex_wait while it links/unlinks this node
+	pthread_t thread; // set to pthread_self() by futex_wait; target of pthread_kill
+	Futex *futex; // futex this waiter was registered for; compared by signal/broadcast
+	Futex_Waitq *waitq; // queue the node was linked into
+	Futex_Waiter *prev, *next;	// circular doubly linked list links
+};
+ 
+// Wait queue: a spin lock plus a sentinel node heading a circular waiter list.
+struct Futex_Waitq {
+	_Spinlock lock;
+	Futex_Waiter list;
+	void init() { // link the sentinel to itself, i.e. an empty queue
+		list.next = &list;
+		list.prev = &list;
+	}
+};
+
+// FIXME: A single global wait queue may scale badly in the future;
+// a possible solution is a hash map of queues (which leads to deadlocks for now).
+ 
+Futex_Waitq g_waitq = { // the one queue returned by get_waitq for every futex
+	.lock = ATOMIC_FLAG_INIT,
+	.list = {
+		.prev = &g_waitq.list, // sentinel points at itself: the queue starts empty
+		.next = &g_waitq.list,
+	},
+};
+ 
+Futex_Waitq *get_waitq(Futex *f) { // all futexes share g_waitq for now
+	(void)f; // reserved for a future per-futex (hash map) lookup
+	return &g_waitq;
+}
+ 
+// Wakes at most one thread blocked in futex_wait on `f` by sending it SIGCONT.
+void futex_signal(Futex *f) {
+	auto waitq = get_waitq(f);
+	waitq->lock.lock();
+	auto head = &waitq->list;
+	for (auto waiter = head->next; waiter != head; waiter = waiter->next) {
+		if (waiter->futex != f) {
+			continue;
+		}
+		// Copy the id under the lock: the waiter node is a local of the sleeping thread's futex_wait.
+		pthread_t target = waiter->thread;
+		waitq->lock.unlock();
+		pthread_kill(target, SIGCONT);
+		return;
+	}
+	waitq->lock.unlock();
+}
+ 
+// Wakes every thread blocked in futex_wait on `f` by sending each one SIGCONT.
+void futex_broadcast(Futex *f) {
+	auto waitq = get_waitq(f);
+
+	// Each waiter node is a local variable of a sleeping thread's futex_wait,
+	// and futex_wait re-takes this lock before unlinking its node. Keeping the
+	// lock for the whole walk (including the final pthread_kill) means no node
+	// is read after its owner could have unlinked it and returned.
+	waitq->lock.lock();
+
+	auto head = &waitq->list;
+	for (auto waiter = head->next; waiter != head; waiter = waiter->next) {
+		if (waiter->futex != f) {
+			continue;
+		}
+		// SIGCONT is the signal futex_wait's sigwait() is waiting for.
+		pthread_kill(waiter->thread, SIGCONT);
+	}
+
+	waitq->lock.unlock();
+}
+ 
+// Blocks the caller in sigwait(SIGCONT) if *f still equals `val`; futex_signal/futex_broadcast send the wake-up.
+void futex_wait(Futex *f, Footex val) {
+	Futex_Waiter waiter;
+	waiter.thread = pthread_self();
+	waiter.futex = f;
+
+	auto waitq = get_waitq(f);
+	// Spin for the queue lock, but give up as soon as the value is seen to differ.
+	while (waitq->lock.state.test_and_set(std::memory_order_acquire)) {
+		if (f->load(std::memory_order_relaxed) != val) {
+			return;
+		}
+		#if defined(GB_CPU_X86)
+		_mm_pause();
+		#else
+		(void)0; // spin...
+		#endif
+	}
+
+	waiter.waitq = waitq;
+	waiter.lock.init();
+	waiter.lock.lock();
+
+	auto head = &waitq->list; // append this waiter at the tail of the circular list
+	waiter.prev = head->prev;
+	waiter.next = head;
+	waiter.prev->next = &waiter;
+	waiter.next->prev = &waiter;
+
+	// SIGCONT must be blocked so it stays pending until sigwait() consumes it.
+	sigset_t old_mask, mask;
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGCONT);
+	pthread_sigmask(SIG_BLOCK, &mask, &old_mask);
+
+	if (f->load(std::memory_order_relaxed) == val) {
+		waiter.lock.unlock();
+		waitq->lock.unlock();
+
+		int sig;
+		sigwait(&mask, &sig);
+
+		waitq->lock.lock();
+		waiter.lock.lock();
+
+		while (waitq != waiter.waitq) { // re-lock the queue the waiter is recorded in
+			auto req = waiter.waitq;
+			waiter.lock.unlock();
+			waitq->lock.unlock();
+			waitq = req;
+			waitq->lock.lock();
+			waiter.lock.lock();
+		}
+	}
+
+	waiter.prev->next = waiter.next; // unlink from the list
+	waiter.next->prev = waiter.prev;
+
+	pthread_sigmask(SIG_SETMASK, &old_mask, NULL);
+
+	waiter.lock.unlock();
+	waitq->lock.unlock();
+}
+
 #endif
 
 #if defined(GB_SYSTEM_WINDOWS)
 	#pragma warning(pop)
-#endif
\ No newline at end of file
+#endif
diff --git a/src/tilde.cpp b/src/tilde.cpp
index 06428f317..4fc7d1c9b 100644
--- a/src/tilde.cpp
+++ b/src/tilde.cpp
@@ -825,6 +825,7 @@ gb_internal bool cg_generate_code(Checker *c, LinkerData *linker_data) {
 		case TargetOs_essence:
 		case TargetOs_freebsd:
 		case TargetOs_openbsd:
+		case TargetOs_haiku:
 			debug_format = TB_DEBUGFMT_DWARF;
 			break;
 		}
diff --git a/vendor/raylib/raylib.odin b/vendor/raylib/raylib.odin
index feb8d05a5..51b43c565 100644
--- a/vendor/raylib/raylib.odin
+++ b/vendor/raylib/raylib.odin
@@ -86,7 +86,6 @@ import "core:fmt"
 import "core:mem"
 import "core:strings"
 
-USE_LINALG :: #config(RAYLIB_USE_LINALG, true)
 import "core:math/linalg"
 _ :: linalg
 
@@ -213,39 +212,19 @@ BLANK      :: Color{ 0, 0, 0, 0 }           // Blank (Transparent)
 MAGENTA    :: Color{ 255, 0, 255, 255 }     // Magenta
 RAYWHITE   :: Color{ 245, 245, 245, 255 }   // My own White (raylib logo)
 
+// Vector2 type
+Vector2 :: linalg.Vector2f32
+// Vector3 type
+Vector3 :: linalg.Vector3f32
+// Vector4 type
+Vector4 :: linalg.Vector4f32
 
-when USE_LINALG {
-	// Vector2 type
-	Vector2 :: linalg.Vector2f32
-	// Vector3 type
-	Vector3 :: linalg.Vector3f32
-	// Vector4 type
-	Vector4 :: linalg.Vector4f32
+// Quaternion type
+Quaternion :: linalg.Quaternionf32
 
-	// Quaternion type
-	Quaternion :: linalg.Quaternionf32
+// Matrix type (OpenGL style 4x4 - right handed, stored column major)
+Matrix :: linalg.Matrix4x4f32
 
-	// Matrix type (OpenGL style 4x4 - right handed, column major)
-	Matrix :: linalg.Matrix4x4f32
-} else {
-	// Vector2 type
-	Vector2 :: distinct [2]f32
-	// Vector3 type
-	Vector3 :: distinct [3]f32
-	// Vector4 type
-	Vector4 :: distinct [4]f32
-
-	// Quaternion type
-	Quaternion :: distinct quaternion128
-	
-	// Matrix, 4x4 components, column major, OpenGL style, right handed
-	Matrix :: struct {
-		m0, m4, m8, m12:  f32, // Matrix first row (4 components)
-		m1, m5, m9, m13:  f32, // Matrix second row (4 components)
-		m2, m6, m10, m14: f32, // Matrix third row (4 components)
-		m3, m7, m11, m15: f32, // Matrix fourth row (4 components)
-	}
-}
 
 // Color, 4 components, R8G8B8A8 (32bit)
 //
diff --git a/vendor/raylib/raymath.odin b/vendor/raylib/raymath.odin
index 9770ecfb1..c657152c1 100644
--- a/vendor/raylib/raymath.odin
+++ b/vendor/raylib/raymath.odin
@@ -85,33 +85,33 @@ Vector2SubtractValue :: proc "c" (v: Vector2, value: f32) -> Vector2 {
 	return v - value
 }
 // Calculate vector length
-@(require_results, deprecated="Prefer linalg.length(v)")
+@(require_results)
 Vector2Length :: proc "c" (v: Vector2) -> f32 {
 	return linalg.length(v)
 }
 // Calculate vector square length
-@(require_results, deprecated="Prefer linalg.length2(v)")
+@(require_results)
 Vector2LengthSqr :: proc "c" (v: Vector2) -> f32 {
 	return linalg.length2(v)
 }
 // Calculate two vectors dot product
-@(require_results, deprecated="Prefer linalg.dot(v1, v2)")
+@(require_results)
 Vector2DotProduct :: proc "c" (v1, v2: Vector2) -> f32 {
 	return linalg.dot(v1, v2)
 }
 // Calculate distance between two vectors
-@(require_results, deprecated="Prefer linalg.distance(v1, v2)")
+@(require_results)
 Vector2Distance :: proc "c" (v1, v2: Vector2) -> f32 {
 	return linalg.distance(v1, v2)
 }
 // Calculate square distance between two vectors
-@(require_results, deprecated="Prefer linalg.length2(v2-v1)")
+@(require_results)
 Vector2DistanceSqrt :: proc "c" (v1, v2: Vector2) -> f32 {
 	return linalg.length2(v2-v1)
 }
 // Calculate angle between two vectors
 // NOTE: Angle is calculated from origin point (0, 0)
-@(require_results, deprecated="Prefer linalg.angle_between(v1, v2)")
+@(require_results)
 Vector2Angle :: proc "c" (v1, v2: Vector2) -> f32 {
 	return linalg.angle_between(v1, v2)
 }
@@ -146,7 +146,7 @@ Vector2Divide :: proc "c" (v1, v2: Vector2) -> Vector2 {
 	return v1 / v2
 }
 // Normalize provided vector
-@(require_results, deprecated="Prefer linalg.normalize0(v)")
+@(require_results)
 Vector2Normalize :: proc "c" (v: Vector2) -> Vector2 {
 	return linalg.normalize0(v)
 }
@@ -270,38 +270,38 @@ Vector3SubtractValue :: proc "c" (v: Vector3, value: f32) -> Vector3 {
 	return v - value
 }
 // Calculate vector length
-@(require_results, deprecated="Prefer linalg.length(v)")
+@(require_results)
 Vector3Length :: proc "c" (v: Vector3) -> f32 {
 	return linalg.length(v)
 }
 // Calculate vector square length
-@(require_results, deprecated="Prefer linalg.length2(v)")
+@(require_results)
 Vector3LengthSqr :: proc "c" (v: Vector3) -> f32 {
 	return linalg.length2(v)
 }
 // Calculate two vectors dot product
-@(require_results, deprecated="Prefer linalg.dot(v1, v2)")
+@(require_results)
 Vector3DotProduct :: proc "c" (v1, v2: Vector3) -> f32 {
 	return linalg.dot(v1, v2)
 }
 // Calculate two vectors dot product
-@(require_results, deprecated="Prefer linalg.cross(v1, v2)")
+@(require_results)
 Vector3CrossProduct :: proc "c" (v1, v2: Vector3) -> Vector3 {
 	return linalg.cross(v1, v2)
 }
 // Calculate distance between two vectors
-@(require_results, deprecated="Prefer linalg.distance(v1, v2)")
+@(require_results)
 Vector3Distance :: proc "c" (v1, v2: Vector3) -> f32 {
 	return linalg.distance(v1, v2)
 }
 // Calculate square distance between two vectors
-@(require_results, deprecated="Prefer linalg.length2(v2-v1)")
+@(require_results)
 Vector3DistanceSqrt :: proc "c" (v1, v2: Vector3) -> f32 {
 	return linalg.length2(v2-v1)
 }
 // Calculate angle between two vectors
 // NOTE: Angle is calculated from origin point (0, 0)
-@(require_results, deprecated="Prefer linalg.angle_between(v1, v2)")
+@(require_results)
 Vector3Angle :: proc "c" (v1, v2: Vector3) -> f32 {
 	return linalg.angle_between(v1, v2)
 }
@@ -336,7 +336,7 @@ Vector3Divide :: proc "c" (v1, v2: Vector3) -> Vector3 {
 	return v1 / v2
 }
 // Normalize provided vector
-@(require_results, deprecated="Prefer linalg.normalize0(v)")
+@(require_results)
 Vector3Normalize :: proc "c" (v: Vector3) -> Vector3 {
 	return linalg.normalize0(v)
 }
@@ -364,7 +364,7 @@ Vector3OrthoNormalize :: proc "c" (v1, v2: ^Vector3) {
 }
 
 // Transform a vector by quaternion rotation
-@(require_results, deprecated="Prefer linalg.mul(q, v")
+@(require_results)
 Vector3RotateByQuaternion :: proc "c" (v: Vector3, q: Quaternion) -> Vector3 {
 	return linalg.mul(q, v)
 }
@@ -480,12 +480,12 @@ Vector3Equals :: proc "c" (p, q: Vector3) -> bool {
 }
 
 
-@(require_results, deprecated="Prefer linalg.min(v1, v2)")
+@(require_results)
 Vector3Min :: proc "c" (v1, v2: Vector3) -> Vector3 {
 	return linalg.min(v1, v2)
 }
 
-@(require_results, deprecated="Prefer linalg.max(v1, v2)")
+@(require_results)
 Vector3Max :: proc "c" (v1, v2: Vector3) -> Vector3 {
 	return linalg.max(v1, v2)
 }
@@ -539,25 +539,25 @@ Vector3Unproject :: proc "c" (source: Vector3, projection: Matrix, view: Matrix)
 //----------------------------------------------------------------------------------
 
 // Compute matrix determinant
-@(require_results, deprecated="Prefer linalg.determinant(mat)")
+@(require_results)
 MatrixDeterminant :: proc "c" (mat: Matrix) -> f32 {
 	return linalg.determinant(mat)
 }
 
 // Get the trace of the matrix (sum of the values along the diagonal)
-@(require_results, deprecated="Prefer linalg.trace(mat)")
+@(require_results)
 MatrixTrace :: proc "c" (mat: Matrix) -> f32 {
 	return linalg.trace(mat)
 }
 
 // Transposes provided matrix
-@(require_results, deprecated="Prefer linalg.transpose(mat)")
+@(require_results)
 MatrixTranspose :: proc "c" (mat: Matrix) -> Matrix {
 	return linalg.transpose(mat)
 }
 
 // Invert provided matrix
-@(require_results, deprecated="Prefer linalg.inverse(mat)")
+@(require_results)
 MatrixInvert :: proc "c" (mat: Matrix) -> Matrix {
 	return linalg.inverse(mat)
 }
@@ -704,7 +704,7 @@ QuaternionLength :: proc "c" (q: Quaternion) -> f32 {
 	return abs(q)
 }
 // Normalize provided quaternion
-@(require_results, deprecated="Prefer linalg.normalize0(q)")
+@(require_results)
 QuaternionNormalize :: proc "c" (q: Quaternion) -> Quaternion {
 	return linalg.normalize0(q)
 }