Merge tag 'dev-2025-04'

2026-07-25 08:57:55 +00:00 · 2025-04-14 14:36:56 -04:00
parent 5bd4b6fb21 d9f990d42e
commit 18683c6172
158 changed files with 8269 additions and 1630 deletions
@@ -93,14 +93,14 @@ jobs:
      - name: Download LLVM (MacOS Intel)
        if: matrix.os == 'macos-13'
        run: |
-          brew install llvm@18 lua@5.4
-          echo "/usr/local/opt/llvm@18/bin" >> $GITHUB_PATH
+          brew update
+          brew install llvm@20 lua@5.4 lld

      - name: Download LLVM (MacOS ARM)
        if: matrix.os == 'macos-14'
        run: |
-          brew install llvm@18 wasmtime lua@5.4
-          echo "/opt/homebrew/opt/llvm@18/bin" >> $GITHUB_PATH
+          brew update
+          brew install llvm@20 wasmtime lua@5.4 lld

      - name: Build Odin
        run: ./build_odin.sh release
@@ -49,12 +49,12 @@ jobs:
      - uses: actions/checkout@v4
      - uses: jirutka/setup-alpine@v1
        with:
-          branch: v3.20
+          branch: edge
      - name: (Linux) Download LLVM
        run: |
          apk add --no-cache \
-          musl-dev llvm18-dev clang18 git mold lz4 \
-          libxml2-static llvm18-static zlib-static zstd-static \
+          musl-dev llvm20-dev clang20 git mold lz4 \
+          libxml2-static llvm20-static zlib-static zstd-static \
          make
        shell: alpine.sh --root {0}
      - name: build odin
@@ -93,8 +93,9 @@ jobs:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
-          brew install llvm@18 dylibbundler
-          echo "/usr/local/opt/llvm@18/bin" >> $GITHUB_PATH
+          brew update
+          brew install llvm@20 dylibbundler lld
+
      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
        # not link with libunwind bundled with LLVM but link with libunwind on the system.
@@ -130,8 +131,9 @@ jobs:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
-          brew install llvm@18 dylibbundler
-          echo "/opt/homebrew/opt/llvm@18/bin" >> $GITHUB_PATH
+          brew update
+          brew install llvm@20 dylibbundler lld
+
      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
        # not link with libunwind bundled with LLVM but link with libunwind on the system.
@@ -9,7 +9,7 @@ foreign libc {
 	@(link_name="write")
 	_unix_write :: proc(fd: i32, buf: rawptr, size: int) -> int ---

-	when ODIN_OS == .NetBSD {
+	when ODIN_OS == .NetBSD || ODIN_OS == .OpenBSD {
 		@(link_name="__errno") __error :: proc() -> ^i32 ---
 	} else {
 		__error :: proc() -> ^i32 ---
@@ -4,12 +4,12 @@ setlocal EnableDelayedExpansion

 where /Q cl.exe || (
 	set __VSCMD_ARG_NO_LOGO=1
-	for /f "tokens=*" %%i in ('"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -latest -requires Microsoft.VisualStudio.Workload.NativeDesktop -property installationPath') do set VS=%%i
+	for /f "tokens=*" %%i in ('"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath') do set VS=%%i
 	if "!VS!" equ "" (
-		echo ERROR: Visual Studio installation not found
+		echo ERROR: MSVC installation not found
 		exit /b 1
 	)
-	call "!VS!\VC\Auxiliary\Build\vcvarsall.bat" amd64 || exit /b 1
+	call "!VS!\Common7\Tools\vsdevcmd.bat" -arch=x64 -host_arch=x64 || exit /b 1
 )

 if "%VSCMD_ARG_TGT_ARCH%" neq "x64" (
@@ -152,4 +152,4 @@ if %release_mode% EQU 0 echo: & echo Debug compiler built. Note: run "build.bat

 del *.obj > NUL 2> NUL

-:end_of_build
+:end_of_build
@@ -25,7 +25,7 @@ error() {

 # Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
 if [ -z "$LLVM_CONFIG" ] &&  [ -n "$(command -v brew)" ]; then
-    if   [ -n "$(command -v $(brew --prefix llvm)/bin/llvm-config)"    ]; then LLVM_CONFIG="$(brew --prefix llvm)/bin/llvm-config"
+    if   [ -n "$(command -v $(brew --prefix llvm@20)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@20)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@19)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@19)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@18)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@18)/bin/llvm-config"
    elif [ -n "$(command -v $(brew --prefix llvm@17)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@17)/bin/llvm-config"
@@ -1,8 +1,8 @@
 #!/usr/bin/env sh
 # Intended for use in Alpine containers, see the "nightly" Github action for a list of dependencies

-CXX="clang++-18"
-LLVM_CONFIG="llvm-config-18"
+CXX="clang++-20"
+LLVM_CONFIG="llvm-config-20"

 DISABLED_WARNINGS="-Wno-switch -Wno-macro-redefined -Wno-unused-value"

@@ -25,4 +25,5 @@ GHASH_BLOCK_SIZE :: 16
 GHASH_TAG_SIZE :: 16

 // RCON is the AES keyschedule round constants.
+@(rodata)
 RCON := [10]byte{0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36}
@@ -22,8 +22,6 @@

 package aes_ct64

-import "base:intrinsics"
-
 // Bitsliced AES for 64-bit general purpose (integer) registers.  Each
 // invocation will process up to 4 blocks at a time.  This implementation
 // is derived from the BearSSL ct64 code, and distributed under a 1-clause
@@ -212,11 +210,8 @@ orthogonalize :: proc "contextless" (q: ^[8]u64) {
 }

@(require_results)
-interleave_in :: proc "contextless" (w: []u32) -> (q0, q1: u64) #no_bounds_check {
-	if len(w) < 4 {
-		intrinsics.trap()
-	}
-	x0, x1, x2, x3 := u64(w[0]), u64(w[1]), u64(w[2]), u64(w[3])
+interleave_in :: proc "contextless" (w0, w1, w2, w3: u32) -> (q0, q1: u64) #no_bounds_check {
+	x0, x1, x2, x3 := u64(w0), u64(w1), u64(w2), u64(w3)
 	x0 |= (x0 << 16)
 	x1 |= (x1 << 16)
 	x2 |= (x2 << 16)
@@ -22,12 +22,8 @@

 package aes_ct64

-import "base:intrinsics"
-
 add_round_key :: proc "contextless" (q: ^[8]u64, sk: []u64) #no_bounds_check {
-	if len(sk) < 8 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(sk) >= 8, "aes/ct64: invalid round key size")

 	q[0] ~= sk[0]
 	q[1] ~= sk[1]
@@ -22,7 +22,6 @@

 package aes_ct64

-import "base:intrinsics"
 import "core:crypto/_aes"
 import "core:encoding/endian"
 import "core:mem"
@@ -42,7 +41,7 @@ sub_word :: proc "contextless" (x: u32) -> u32 {
 }

@(private, require_results)
-keysched :: proc(comp_skey: []u64, key: []byte) -> int {
+keysched :: proc "contextless" (comp_skey: []u64, key: []byte) -> int {
 	num_rounds, key_len := 0, len(key)
 	switch key_len {
 	case _aes.KEY_SIZE_128:
@@ -52,7 +51,7 @@ keysched :: proc(comp_skey: []u64, key: []byte) -> int {
 	case _aes.KEY_SIZE_256:
 		num_rounds = _aes.ROUNDS_256
 	case:
-		panic("crypto/aes: invalid AES key size")
+		panic_contextless("crypto/aes: invalid AES key size")
 	}

 	skey: [60]u32 = ---
@@ -78,7 +77,7 @@ keysched :: proc(comp_skey: []u64, key: []byte) -> int {

 	q: [8]u64 = ---
 	for i, j := 0, 0; i < nkf; i, j = i + 4, j + 2 {
-		q[0], q[4] = interleave_in(skey[i:])
+		q[0], q[4] = interleave_in(skey[i], skey[i+1], skey[i+2], skey[i+3])
 		q[1] = q[0]
 		q[2] = q[0]
 		q[3] = q[0]
@@ -123,57 +122,3 @@ skey_expand :: proc "contextless" (skey, comp_skey: []u64, num_rounds: int) {
 		skey[v + 3] = (x3 << 4) - x3
 	}
 }
-
-orthogonalize_roundkey :: proc "contextless" (qq: []u64, key: []byte) {
-	if len(qq) < 8 || len(key) != 16 {
-		intrinsics.trap()
-	}
-
-	skey: [4]u32 = ---
-	skey[0] = endian.unchecked_get_u32le(key[0:])
-	skey[1] = endian.unchecked_get_u32le(key[4:])
-	skey[2] = endian.unchecked_get_u32le(key[8:])
-	skey[3] = endian.unchecked_get_u32le(key[12:])
-
-	q: [8]u64 = ---
-	q[0], q[4] = interleave_in(skey[:])
-	q[1] = q[0]
-	q[2] = q[0]
-	q[3] = q[0]
-	q[5] = q[4]
-	q[6] = q[4]
-	q[7] = q[4]
-	orthogonalize(&q)
-
-	comp_skey: [2]u64 = ---
-	comp_skey[0] =
-		(q[0] & 0x1111111111111111) |
-		(q[1] & 0x2222222222222222) |
-		(q[2] & 0x4444444444444444) |
-		(q[3] & 0x8888888888888888)
-	comp_skey[1] =
-		(q[4] & 0x1111111111111111) |
-		(q[5] & 0x2222222222222222) |
-		(q[6] & 0x4444444444444444) |
-		(q[7] & 0x8888888888888888)
-
-	for x, u in comp_skey {
-		x0 := x
-		x1, x2, x3 := x0, x0, x0
-		x0 &= 0x1111111111111111
-		x1 &= 0x2222222222222222
-		x2 &= 0x4444444444444444
-		x3 &= 0x8888888888888888
-		x1 >>= 1
-		x2 >>= 2
-		x3 >>= 3
-		qq[u * 4 + 0] = (x0 << 4) - x0
-		qq[u * 4 + 1] = (x1 << 4) - x1
-		qq[u * 4 + 2] = (x2 << 4) - x2
-		qq[u * 4 + 3] = (x3 << 4) - x3
-	}
-
-	mem.zero_explicit(&skey, size_of(skey))
-	mem.zero_explicit(&q, size_of(q))
-	mem.zero_explicit(&comp_skey, size_of(comp_skey))
-}
@@ -22,7 +22,6 @@

 package aes_ct64

-import "base:intrinsics"
 import "core:crypto/_aes"
 import "core:encoding/endian"

@@ -64,9 +63,8 @@ rev64 :: proc "contextless" (x: u64) -> u64 {
 // Note: `dst` is both an input and an output, to support easy implementation
 // of GCM.
 ghash :: proc "contextless" (dst, key, data: []byte) {
-	if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(dst) == _aes.GHASH_BLOCK_SIZE)
+	ensure_contextless(len(key) == _aes.GHASH_BLOCK_SIZE)

 	buf := data
 	l := len(buf)
@@ -1,60 +1,61 @@
 package aes_ct64

-import "base:intrinsics"
 import "core:crypto/_aes"
 import "core:encoding/endian"

-load_blockx1 :: proc "contextless" (q: ^[8]u64, src: []byte) {
-	if len(src) != _aes.BLOCK_SIZE {
-		intrinsics.trap()
-	}
-
-	w: [4]u32 = ---
-	w[0] = endian.unchecked_get_u32le(src[0:])
-	w[1] = endian.unchecked_get_u32le(src[4:])
-	w[2] = endian.unchecked_get_u32le(src[8:])
-	w[3] = endian.unchecked_get_u32le(src[12:])
-	q[0], q[4] = interleave_in(w[:])
-	orthogonalize(q)
+@(require_results)
+load_interleaved :: proc "contextless" (src: []byte) -> (u64, u64) #no_bounds_check {
+	w0 := endian.unchecked_get_u32le(src[0:])
+	w1 := endian.unchecked_get_u32le(src[4:])
+	w2 := endian.unchecked_get_u32le(src[8:])
+	w3 := endian.unchecked_get_u32le(src[12:])
+	return interleave_in(w0, w1, w2, w3)
 }

-store_blockx1 :: proc "contextless" (dst: []byte, q: ^[8]u64) {
-	if len(dst) != _aes.BLOCK_SIZE {
-		intrinsics.trap()
-	}
-
-	orthogonalize(q)
-	w0, w1, w2, w3 := interleave_out(q[0], q[4])
+store_interleaved :: proc "contextless" (dst: []byte, a0, a1: u64) #no_bounds_check {
+	w0, w1, w2, w3 := interleave_out(a0, a1)
 	endian.unchecked_put_u32le(dst[0:], w0)
 	endian.unchecked_put_u32le(dst[4:], w1)
 	endian.unchecked_put_u32le(dst[8:], w2)
 	endian.unchecked_put_u32le(dst[12:], w3)
 }

+@(require_results)
+xor_interleaved :: #force_inline proc "contextless" (a0, a1, b0, b1: u64) -> (u64, u64) {
+	return a0 ~ b0, a1 ~ b1
+}
+
+@(require_results)
+and_interleaved :: #force_inline proc "contextless" (a0, a1, b0, b1: u64) -> (u64, u64) {
+	return a0 & b0, a1 & b1
+}
+
+load_blockx1 :: proc "contextless" (q: ^[8]u64, src: []byte) {
+	ensure_contextless(len(src) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size")
+
+	q[0], q[4] = #force_inline load_interleaved(src)
+	orthogonalize(q)
+}
+
+store_blockx1 :: proc "contextless" (dst: []byte, q: ^[8]u64) {
+	ensure_contextless(len(dst) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size")
+
+	orthogonalize(q)
+	#force_inline store_interleaved(dst, q[0], q[4])
+}
+
 load_blocks :: proc "contextless" (q: ^[8]u64, src: [][]byte) {
-	if n := len(src); n > STRIDE || n == 0 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(src) > 0 && len(src) <= STRIDE, "aes/ct64: invalid block(s) size")

-	w: [4]u32 = ---
 	for s, i in src {
-		if len(s) != _aes.BLOCK_SIZE {
-			intrinsics.trap()
-		}
-
-		w[0] = endian.unchecked_get_u32le(s[0:])
-		w[1] = endian.unchecked_get_u32le(s[4:])
-		w[2] = endian.unchecked_get_u32le(s[8:])
-		w[3] = endian.unchecked_get_u32le(s[12:])
-		q[i], q[i + 4] = interleave_in(w[:])
+		ensure_contextless(len(s) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size")
+		q[i], q[i + 4] = #force_inline load_interleaved(s)
 	}
 	orthogonalize(q)
 }

 store_blocks :: proc "contextless" (dst: [][]byte, q: ^[8]u64) {
-	if n := len(dst); n > STRIDE || n == 0 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(dst) > 0 && len(dst) <= STRIDE, "aes/ct64: invalid block(s) size")

 	orthogonalize(q)
 	for d, i in dst {
@@ -62,14 +63,7 @@ store_blocks :: proc "contextless" (dst: [][]byte, q: ^[8]u64) {
 		if d == nil {
 			break
 		}
-		if len(d) != _aes.BLOCK_SIZE {
-			intrinsics.trap()
-		}
-
-		w0, w1, w2, w3 := interleave_out(q[i], q[i + 4])
-		endian.unchecked_put_u32le(d[0:], w0)
-		endian.unchecked_put_u32le(d[4:], w1)
-		endian.unchecked_put_u32le(d[8:], w2)
-		endian.unchecked_put_u32le(d[12:], w3)
+		ensure_contextless(len(d) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size")
+		#force_inline store_interleaved(d, q[i], q[i + 4])
 	}
 }
@@ -52,7 +52,7 @@ GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
 // that it is right-shifted by 1 bit. The left-shift is relatively
 // inexpensive, and it can be mutualised.
 //
-// Since SSE2 opcodes do not have facilities for shitfting full 128-bit
+// Since SSE2 opcodes do not have facilities for shifting full 128-bit
 // values with bit precision, we have to break down values into 64-bit
 // chunks. We number chunks from 0 to 3 in left to right order.

@@ -155,7 +155,7 @@ square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128
@(enable_target_feature = "sse2,ssse3,pclmul")
 ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
 	if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
-		intrinsics.trap()
+		panic_contextless("aes/ghash: invalid dst or key size")
 	}

 	// Note: BearSSL opts to copy the remainder into a zero-filled
@@ -18,6 +18,8 @@ BLAKE2S_SIZE :: 32
 BLAKE2B_BLOCK_SIZE :: 128
 BLAKE2B_SIZE :: 64

+MAX_SIZE :: 255
+
 Blake2s_Context :: struct {
 	h:            [8]u32,
 	t:            [2]u32,
@@ -68,13 +70,13 @@ Blake2_Tree :: struct {
 	is_last_node:    bool,
 }

-@(private)
+@(private, rodata)
 BLAKE2S_IV := [8]u32 {
 	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
 	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
 }

-@(private)
+@(private, rodata)
 BLAKE2B_IV := [8]u64 {
 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
 	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
@@ -82,16 +84,13 @@ BLAKE2B_IV := [8]u64 {
 	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 }

-init :: proc(ctx: ^$T, cfg: ^Blake2_Config) {
+init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) {
 	when T == Blake2s_Context {
 		max_size :: BLAKE2S_SIZE
 	} else when T == Blake2b_Context {
 		max_size :: BLAKE2B_SIZE
 	}
-
-	if cfg.size > max_size {
-		panic("blake2: requested output size exceeeds algorithm max")
-	}
+	ensure_contextless(cfg.size <= max_size, "blake2: requested output size exceeds algorithm max")

 	// To save having to allocate a scratch buffer, use the internal
 	// data buffer (`ctx.x`), as it is exactly the correct size.
@@ -167,8 +166,8 @@ init :: proc(ctx: ^$T, cfg: ^Blake2_Config) {
 	ctx.is_initialized = true
 }

-update :: proc(ctx: ^$T, p: []byte) {
-	assert(ctx.is_initialized)
+update :: proc "contextless" (ctx: ^$T, p: []byte) {
+	ensure_contextless(ctx.is_initialized)

 	p := p
 	when T == Blake2s_Context {
@@ -195,8 +194,8 @@ update :: proc(ctx: ^$T, p: []byte) {
 	ctx.nx += copy(ctx.x[ctx.nx:], p)
 }

-final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
+final :: proc "contextless" (ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
+	ensure_contextless(ctx.is_initialized)

 	ctx := ctx
 	if finalize_clone {
@@ -206,24 +205,19 @@ final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
 	}
 	defer(reset(ctx))

+	ensure_contextless(len(hash) >= int(ctx.size), "crypto/blake2: invalid destination digest size")
 	when T == Blake2s_Context {
-		if len(hash) < int(ctx.size) {
-			panic("crypto/blake2s: invalid destination digest size")
-		}
 		blake2s_final(ctx, hash)
 	} else when T == Blake2b_Context {
-		if len(hash) < int(ctx.size) {
-			panic("crypto/blake2b: invalid destination digest size")
-		}
 		blake2b_final(ctx, hash)
 	}
 }

-clone :: proc(ctx, other: ^$T) {
+clone :: proc "contextless" (ctx, other: ^$T) {
 	ctx^ = other^
 }

-reset :: proc(ctx: ^$T) {
+reset :: proc "contextless" (ctx: ^$T) {
 	if !ctx.is_initialized {
 		return
 	}
@@ -1,6 +1,5 @@
 package _chacha20

-import "base:intrinsics"
 import "core:encoding/endian"
 import "core:math/bits"
 import "core:mem"
@@ -46,9 +45,8 @@ Context :: struct {
 // derivation is expected to be handled by the caller, so that the
 // HChaCha call can be suitably accelerated.
 init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) {
-	if len(key) != KEY_SIZE || len(iv) != IV_SIZE {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(key) == KEY_SIZE, "chacha20: invalid key size")
+	ensure_contextless(len(iv) == IV_SIZE, "chacha20: invalid iv size")

 	k, n := key, iv

@@ -76,12 +74,10 @@ init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) {

 // seek seeks the (X)ChaCha20 stream counter to the specified block.
 seek :: proc(ctx: ^Context, block_nr: u64) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	if ctx._is_ietf_flavor {
-		if block_nr > MAX_CTR_IETF {
-			panic("crypto/chacha20: attempted to seek past maximum counter")
-		}
+		ensure(block_nr <= MAX_CTR_IETF, "crypto/chacha20: attempted to seek past maximum counter")
 	} else {
 		ctx._s[13] = u32(block_nr >> 32)
 	}
@@ -102,7 +98,7 @@ check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) {
 	// Enforce the maximum consumed keystream per IV.
 	//
 	// While all modern "standard" definitions of ChaCha20 use
-	// the IETF 32-bit counter, for XChaCha20 most common
+	// the IETF 32-bit counter, for XChaCha20 historical
 	// implementations allow for a 64-bit counter.
 	//
 	// Honestly, the answer here is "use a MRAE primitive", but
@@ -110,14 +106,14 @@ check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) {

 	ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per IV reached"

+	ctr_ok: bool
 	if ctx._is_ietf_flavor {
-		if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF {
-			panic(ERR_CTR_EXHAUSTED)
-		}
+		ctr_ok = u64(ctx._s[12]) + u64(nr_blocks) <= MAX_CTR_IETF
 	} else {
 		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
-		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
-			panic(ERR_CTR_EXHAUSTED)
-		}
+		_, carry := bits.add_u64(ctr, u64(nr_blocks), 0)
+		ctr_ok = carry == 0
 	}
+
+	ensure(ctr_ok, ERR_CTR_EXHAUSTED)
 }
@@ -29,11 +29,24 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 	// explicitly using simd.u8x16 shuffles.
 	@(private = "file")
 	TARGET_SIMD_FEATURES :: "sse2,ssse3"
+} else when ODIN_ARCH == .riscv64 {
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "v"
 } else {
 	@(private = "file")
 	TARGET_SIMD_FEATURES :: ""
 }

+// Some targets lack runtime feature detection, and will flat out refuse
+// to load binaries that have unknown instructions.  This is distinct from
+// `simd.IS_EMULATED` as actually good designs support runtime feature
+// detection and that constant establishes a baseline.
+//
+// See:
+// - https://github.com/WebAssembly/design/issues/1161
+@(private = "file")
+TARGET_IS_DESIGNED_BY_IDIOTS :: (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128")
+
@(private = "file")
 _ROT_7L: simd.u32x4 : {7, 7, 7, 7}
@(private = "file")
@@ -205,11 +218,13 @@ _store_simd128 :: #force_inline proc "contextless" (
 // is_performant returns true iff the target and current host both support
 // "enough" 128-bit SIMD to make this implementation performant.
 is_performant :: proc "contextless" () -> bool {
-	when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 {
 		when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 			req_features :: info.CPU_Features{.asimd}
 		} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
 			req_features :: info.CPU_Features{.sse2, .ssse3}
+		} else when ODIN_ARCH == .riscv64 {
+			req_features :: info.CPU_Features{.V}
 		}

 		features, ok := info.cpu_features.?
@@ -245,8 +260,17 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int)

 	// 8 blocks at a time.
 	//
-	// Note: This is only worth it on Aarch64.
-	when ODIN_ARCH == .arm64 {
+	// Note:
+	// This uses a ton of registers so it is only worth it on targets
+	// that have something like 32 128-bit registers.  This is currently
+	// all ARMv8 targets, and RISC-V Zvl128b (`V` application profile)
+	// targets.
+	//
+	// While our current definition of `.arm32` is 32-bit ARMv8, this
+	// may change in the future (ARMv7 is still relevant), and things
+	// like Cortex-A8/A9 does "pretend" 128-bit SIMD 64-bits at a time
+	// thus needs benchmarking.
+	when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 {
 		for ; n >= 8; n = n - 8 {
 			v0, v1, v2, v3 := s0, s1, s2, s3

@@ -354,9 +378,11 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int)

 	// 4 blocks at a time.
 	//
-	// Note: The i386 target lacks the required number of registers
-	// for this to be performant, so it is skipped.
-	when ODIN_ARCH != .i386 {
+	// Note: This is skipped on several targets for various reasons.
+	// - i386 lacks the required number of registers
+	// - Generating code when runtime "hardware" SIMD support is impossible
+	//   to detect is pointless, since this will be emulated using GP regs.
+	when ODIN_ARCH != .i386 && !TARGET_IS_DESIGNED_BY_IDIOTS {
 		for ; n >= 4; n = n - 4 {
 			v0, v1, v2, v3 := s0, s1, s2, s3

@@ -13,5 +13,5 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int)
 }

 hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
-	intrinsics.trap()
+	panic_contextless("crypto/chacha20: simd256 implementation unsupported")
 }
@@ -11,7 +11,6 @@ See:
 - https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html
 */

-import "base:intrinsics"
 import "core:crypto"
 import field "core:crypto/_fiat/field_curve25519"
 import "core:mem"
@@ -32,6 +31,7 @@ import "core:mem"
 // - The group element decoding routine takes the opinionated stance of
 //   rejecting non-canonical encodings.

+@(rodata)
 FE_D := field.Tight_Field_Element {
 	929955233495203,
 	466365720129213,
@@ -39,7 +39,7 @@ FE_D := field.Tight_Field_Element {
 	2033849074728123,
 	1442794654840575,
 }
-@(private)
+@(private, rodata)
 FE_A := field.Tight_Field_Element {
 	2251799813685228,
 	2251799813685247,
@@ -47,7 +47,7 @@ FE_A := field.Tight_Field_Element {
 	2251799813685247,
 	2251799813685247,
 }
-@(private)
+@(private, rodata)
 FE_D2 := field.Tight_Field_Element {
 	1859910466990425,
 	932731440258426,
@@ -55,7 +55,7 @@ FE_D2 := field.Tight_Field_Element {
 	1815898335770999,
 	633789495995903,
 }
-@(private)
+@(private, rodata)
 GE_BASEPOINT := Group_Element {
 	field.Tight_Field_Element {
 		1738742601995546,
@@ -80,6 +80,7 @@ GE_BASEPOINT := Group_Element {
 		1821297809914039,
 	},
 }
+@(rodata)
 GE_IDENTITY := Group_Element {
 	field.Tight_Field_Element{0, 0, 0, 0, 0},
 	field.Tight_Field_Element{1, 0, 0, 0, 0},
@@ -107,9 +108,7 @@ ge_set :: proc "contextless" (ge, a: ^Group_Element) {

@(require_results)
 ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool {
-	if len(b) != 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(b) == 32, "edwards25519: invalid group element size")
 	b_ := (^[32]byte)(raw_data(b))

 	// Do the work in a scratch element, so that ge is unchanged on
@@ -166,9 +165,7 @@ ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool {
 }

 ge_bytes :: proc "contextless" (ge: ^Group_Element, dst: []byte) {
-	if len(dst) != 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(dst) == 32, "edwards25519: invalid group element size")
 	dst_ := (^[32]byte)(raw_data(dst))

 	// Convert the element to affine (x, y) representation.
@@ -1,6 +1,5 @@
 package _edwards25519

-import "base:intrinsics"
 import field "core:crypto/_fiat/field_scalar25519"
 import "core:mem"

@@ -8,7 +7,7 @@ Scalar :: field.Montgomery_Domain_Field_Element

 // WARNING: This is non-canonical and only to be used when checking if
 // a group element is on the prime-order subgroup.
-@(private)
+@(private, rodata)
 SC_ELL := field.Non_Montgomery_Domain_Field_Element {
 	field.ELL[0],
 	field.ELL[1],
@@ -25,17 +24,13 @@ sc_set_u64 :: proc "contextless" (sc: ^Scalar, i: u64) {

@(require_results)
 sc_set_bytes :: proc "contextless" (sc: ^Scalar, b: []byte) -> bool {
-	if len(b) != 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(b) == 32, "edwards25519: invalid scalar size")
 	b_ := (^[32]byte)(raw_data(b))
 	return field.fe_from_bytes(sc, b_)
 }

 sc_set_bytes_rfc8032 :: proc "contextless" (sc: ^Scalar, b: []byte) {
-	if len(b) != 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(b) == 32, "edwards25519: invalid scalar size")
 	b_ := (^[32]byte)(raw_data(b))
 	field.fe_from_bytes_rfc8032(sc, b_)
 }
@@ -42,9 +42,12 @@ import "core:math/bits"
 Loose_Field_Element :: distinct [5]u64
 Tight_Field_Element :: distinct [5]u64

+@(rodata)
 FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0}
+@(rodata)
 FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0}

+@(rodata)
 FE_SQRT_M1 := Tight_Field_Element {
 	1718705420411056,
 	234908883556509,
@@ -0,0 +1,235 @@
+package field_curve448
+
+import "core:mem"
+
+fe_relax_cast :: #force_inline proc "contextless" (
+	arg1: ^Tight_Field_Element,
+) -> ^Loose_Field_Element {
+	return (^Loose_Field_Element)(arg1)
+}
+
+fe_tighten_cast :: #force_inline proc "contextless" (
+	arg1: ^Loose_Field_Element,
+) -> ^Tight_Field_Element {
+	return (^Tight_Field_Element)(arg1)
+}
+
+fe_clear :: proc "contextless" (
+	arg1: $T,
+) where T == ^Tight_Field_Element || T == ^Loose_Field_Element {
+	mem.zero_explicit(arg1, size_of(arg1^))
+}
+
+fe_clear_vec :: proc "contextless" (
+	arg1: $T,
+) where T == []^Tight_Field_Element || T == []^Loose_Field_Element {
+	for fe in arg1 {
+		fe_clear(fe)
+	}
+}
+
+fe_carry_mul_small :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+	arg2: u64,
+) {
+	arg2_ := Loose_Field_Element{arg2, 0, 0, 0, 0, 0, 0, 0}
+	fe_carry_mul(out1, arg1, &arg2_)
+}
+
+fe_carry_pow2k :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+	arg2: uint,
+) {
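+	// Special case: yields 1 when `arg2 == 0`, though this should never happen. NOTE(review): the loop below computes `arg1^(2^arg2)`, which for `arg2 == 0` would be `arg1` rather than 1 -- confirm intent.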
+	if arg2 == 0 {
+		fe_one(out1)
+		return
+	}
+
+	fe_carry_square(out1, arg1)
+	for _ in 1 ..< arg2 {
+		fe_carry_square(out1, fe_relax_cast(out1))
+	}
+}
+
+fe_carry_inv :: proc "contextless" (
+	out1: ^Tight_Field_Element,
+	arg1: ^Loose_Field_Element,
+) {
+	// Inversion computation is derived from the addition chain:
+	//
+	//	_10     = 2*1
+	//	_11     = 1 + _10
+	//	_110    = 2*_11
+	//	_111    = 1 + _110
+	//	_111000 = _111 << 3
+	//	_111111 = _111 + _111000
+	//	x12     = _111111 << 6 + _111111
+	//	x24     = x12 << 12 + x12
+	//	i34     = x24 << 6
+	//	x30     = _111111 + i34
+	//	x48     = i34 << 18 + x24
+	//	x96     = x48 << 48 + x48
+	//	x192    = x96 << 96 + x96
+	//	x222    = x192 << 30 + x30
+	//	x223    = 2*x222 + 1
+	//	return    (x223 << 223 + x222) << 2 + 1
+	//
+	// Operations: 447 squares 13 multiplies
+	//
+	// Generated by github.com/mmcloughlin/addchain v0.4.0.
+
+	t0, t1, t2: Tight_Field_Element = ---, ---, ---
+
+	// Step 1: t0 = x^0x2
+	fe_carry_square(&t0, arg1)
+
+	// Step 2: t0 = x^0x3
+	fe_carry_mul(&t0, arg1, fe_relax_cast(&t0))
+
+	// Step 3: t0 = x^0x6
+	fe_carry_square(&t0, fe_relax_cast(&t0))
+
+	// Step 4: t0 = x^0x7
+	fe_carry_mul(&t0, arg1, fe_relax_cast(&t0))
+
+	// Step 7: t1 = x^0x38
+	fe_carry_pow2k(&t1, fe_relax_cast(&t0), 3)
+
+	// Step 8: t0 = x^0x3f
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 14: t1 = x^0xfc0
+	fe_carry_pow2k(&t1, fe_relax_cast(&t0), 6)
+
+	// Step 15: t1 = x^0xfff
+	fe_carry_mul(&t1, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 27: t2 = x^0xfff000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 12)
+
+	// Step 28: t1 = x^0xffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 34: t2 = x^0x3fffffc0
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 6)
+
+	// Step 35: t0 = x^0x3fffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t2))
+
+	// Step 53: t2 = x^0xffffff000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t2), 18)
+
+	// Step 54: t1 = x^0xffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 102: t2 = x^0xffffffffffff000000000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 48)
+
+	// Step 103: t1 = x^0xffffffffffffffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 199: t2 = x^0xffffffffffffffffffffffff000000000000000000000000
+	fe_carry_pow2k(&t2, fe_relax_cast(&t1), 96)
+
+	// Step 200: t1 = x^0xffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2))
+
+	// Step 230: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffc0000000
+	fe_carry_pow2k(&t1, fe_relax_cast(&t1), 30)
+
+	// Step 231: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 232: t1 = x^0x7ffffffffffffffffffffffffffffffffffffffffffffffffffffffe
+	fe_carry_square(&t1, fe_relax_cast(&t0))
+
+	// Step 233: t1 = x^0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t1, arg1, fe_relax_cast(&t1))
+
+	// Step 456: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff80000000000000000000000000000000000000000000000000000000
+	fe_carry_pow2k(&t1, fe_relax_cast(&t1), 223)
+
+	// Step 457: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffffffffffffffffffffffffffffffffffffffffffffffffff
+	fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1))
+
+	// Step 459: t0 = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffc
+	fe_carry_pow2k(&t0, fe_relax_cast(&t0), 2)
+
+	// Step 460: z = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffd
+	fe_carry_mul(out1, arg1, fe_relax_cast(&t0))
+
+	fe_clear_vec([]^Tight_Field_Element{&t0, &t1, &t2})
+}
+
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 0
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+	out1[5] = 0
+	out1[6] = 0
+	out1[7] = 0
+}
+
+fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
+	out1[0] = 1
+	out1[1] = 0
+	out1[2] = 0
+	out1[3] = 0
+	out1[4] = 0
+	out1[5] = 0
+	out1[6] = 0
+	out1[7] = 0
+}
+
+fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	x1 := arg1[0]
+	x2 := arg1[1]
+	x3 := arg1[2]
+	x4 := arg1[3]
+	x5 := arg1[4]
+	x6 := arg1[5]
+	x7 := arg1[6]
+	x8 := arg1[7]
+	out1[0] = x1
+	out1[1] = x2
+	out1[2] = x3
+	out1[3] = x4
+	out1[4] = x5
+	out1[5] = x6
+	out1[6] = x7
+	out1[7] = x8
+}
+
+@(optimization_mode = "none")
+fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
+	mask := (u64(arg1) * 0xffffffffffffffff)
+	x := (out1[0] ~ out2[0]) & mask
+	x1, y1 := out1[0] ~ x, out2[0] ~ x
+	x = (out1[1] ~ out2[1]) & mask
+	x2, y2 := out1[1] ~ x, out2[1] ~ x
+	x = (out1[2] ~ out2[2]) & mask
+	x3, y3 := out1[2] ~ x, out2[2] ~ x
+	x = (out1[3] ~ out2[3]) & mask
+	x4, y4 := out1[3] ~ x, out2[3] ~ x
+	x = (out1[4] ~ out2[4]) & mask
+	x5, y5 := out1[4] ~ x, out2[4] ~ x
+	x = (out1[5] ~ out2[5]) & mask
+	x6, y6 := out1[5] ~ x, out2[5] ~ x
+	x = (out1[6] ~ out2[6]) & mask
+	x7, y7 := out1[6] ~ x, out2[6] ~ x
+	x = (out1[7] ~ out2[7]) & mask
+	x8, y8 := out1[7] ~ x, out2[7] ~ x
+	out1[0], out2[0] = x1, y1
+	out1[1], out2[1] = x2, y2
+	out1[2], out2[2] = x3, y3
+	out1[3], out2[3] = x4, y4
+	out1[4], out2[4] = x5, y5
+	out1[5], out2[5] = x6, y6
+	out1[6], out2[6] = x7, y7
+	out1[7], out2[7] = x8, y8
+}
@@ -1,6 +1,5 @@
 package field_poly1305

-import "base:intrinsics"
 import "core:encoding/endian"
 import "core:mem"

@@ -29,9 +28,7 @@ fe_from_bytes :: #force_inline proc "contextless" (
 	// makes implementing the actual MAC block processing considerably
 	// neater.

-	if len(arg1) != 16 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(arg1) == 16, "poly1305: invalid field element size")

 	// While it may be unwise to do deserialization here on our
 	// own when fiat-crypto provides equivalent functionality,
@@ -1,18 +1,17 @@
 package field_scalar25519

-import "base:intrinsics"
 import "core:encoding/endian"
 import "core:math/bits"
 import "core:mem"

-@(private)
+@(private, rodata)
 _TWO_168 := Montgomery_Domain_Field_Element {
 	0x5b8ab432eac74798,
 	0x38afddd6de59d5d7,
 	0xa2c131b399411b7c,
 	0x6329a7ed9ce5a30,
 }
-@(private)
+@(private, rodata)
 _TWO_336 := Montgomery_Domain_Field_Element {
 	0xbd3d108e2b35ecc5,
 	0x5c3a3718bdf9c90b,
@@ -95,9 +94,8 @@ fe_from_bytes_wide :: proc "contextless" (
@(private)
 _fe_from_bytes_short :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element, arg1: []byte) {
 	// INVARIANT: len(arg1) < 32.
-	if len(arg1) >= 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(arg1) < 32, "edwards25519: oversized short scalar")
+
 	tmp: [32]byte
 	copy(tmp[:], arg1)

@@ -106,9 +104,7 @@ _fe_from_bytes_short :: proc "contextless" (out1: ^Montgomery_Domain_Field_Eleme
 }

 fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) {
-	if len(out1) != 32 {
-		intrinsics.trap()
-	}
+	ensure_contextless(len(out1) == 32, "edwards25519: oversized scalar output buffer")

 	tmp: Non_Montgomery_Domain_Field_Element
 	fe_from_montgomery(&tmp, arg1)
@@ -44,7 +44,7 @@ Context :: struct {
 	is_finalized:   bool, // For SHAKE (unlimited squeeze is allowed)
 }

-@(private)
+@(private, rodata)
 keccakf_rndc := [?]u64 {
 	0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
 	0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
@@ -56,13 +56,13 @@ keccakf_rndc := [?]u64 {
 	0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
 }

-@(private)
+@(private, rodata)
 keccakf_rotc := [?]int {
 	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
 	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44,
 }

-@(private)
+@(private, rodata)
 keccakf_piln := [?]i32 {
 	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
 	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1,
@@ -122,7 +122,7 @@ keccakf :: proc "contextless" (st: ^[25]u64) {
 	}
 }

-init :: proc(ctx: ^Context) {
+init :: proc "contextless" (ctx: ^Context) {
 	for i := 0; i < 25; i += 1 {
 		ctx.st.q[i] = 0
 	}
@@ -133,9 +133,9 @@ init :: proc(ctx: ^Context) {
 	ctx.is_finalized = false
 }

-update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized)
-	assert(!ctx.is_finalized)
+update :: proc "contextless" (ctx: ^Context, data: []byte) {
+	ensure_contextless(ctx.is_initialized)
+	ensure_contextless(!ctx.is_finalized)

 	j := ctx.pt
 	for i := 0; i < len(data); i += 1 {
@@ -149,12 +149,9 @@ update :: proc(ctx: ^Context, data: []byte) {
 	ctx.pt = j
 }

-final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
-
-	if len(hash) < ctx.mdlen {
-		panic("crypto/sha3: invalid destination digest size")
-	}
+final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+	ensure_contextless(ctx.is_initialized)
+	ensure_contextless(len(hash) >= ctx.mdlen, "crypto/sha3: invalid destination digest size")

 	ctx := ctx
 	if finalize_clone {
@@ -173,11 +170,11 @@ final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	}
 }

-clone :: proc(ctx, other: ^Context) {
+clone :: proc "contextless" (ctx, other: ^Context) {
 	ctx^ = other^
 }

-reset :: proc(ctx: ^Context) {
+reset :: proc "contextless" (ctx: ^Context) {
 	if !ctx.is_initialized {
 		return
 	}
@@ -185,9 +182,9 @@ reset :: proc(ctx: ^Context) {
 	mem.zero_explicit(ctx, size_of(ctx^))
 }

-shake_xof :: proc(ctx: ^Context) {
-	assert(ctx.is_initialized)
-	assert(!ctx.is_finalized)
+shake_xof :: proc "contextless" (ctx: ^Context) {
+	ensure_contextless(ctx.is_initialized)
+	ensure_contextless(!ctx.is_finalized)

 	ctx.st.b[ctx.pt] ~= ctx.dsbyte
 	ctx.st.b[ctx.rsiz - 1] ~= 0x80
@@ -197,9 +194,9 @@ shake_xof :: proc(ctx: ^Context) {
 	ctx.is_finalized = true // No more absorb, unlimited squeeze.
 }

-shake_out :: proc(ctx: ^Context, hash: []byte) {
-	assert(ctx.is_initialized)
-	assert(ctx.is_finalized)
+shake_out :: proc "contextless" (ctx: ^Context, hash: []byte) {
+	ensure_contextless(ctx.is_initialized)
+	ensure_contextless(ctx.is_finalized)

 	j := ctx.pt
 	for i := 0; i < len(hash); i += 1 {
@@ -3,7 +3,7 @@ package _sha3
 import "core:encoding/endian"
 import "core:math/bits"

-init_cshake :: proc(ctx: ^Context, n, s: []byte, sec_strength: int) {
+init_cshake :: proc "contextless" (ctx: ^Context, n, s: []byte, sec_strength: int) {
 	ctx.mdlen = sec_strength / 8

 	// No domain separator is equivalent to vanilla SHAKE.
@@ -18,7 +18,7 @@ init_cshake :: proc(ctx: ^Context, n, s: []byte, sec_strength: int) {
 	bytepad(ctx, [][]byte{n, s}, rate_cshake(sec_strength))
 }

-final_cshake :: proc(ctx: ^Context, dst: []byte, finalize_clone: bool = false) {
+final_cshake :: proc "contextless" (ctx: ^Context, dst: []byte, finalize_clone: bool = false) {
 	ctx := ctx
 	if finalize_clone {
 		tmp_ctx: Context
@@ -32,7 +32,7 @@ final_cshake :: proc(ctx: ^Context, dst: []byte, finalize_clone: bool = false) {
 	shake_out(ctx, dst)
 }

-rate_cshake :: #force_inline proc(sec_strength: int) -> int {
+rate_cshake :: #force_inline proc "contextless" (sec_strength: int) -> int {
 	switch sec_strength {
 	case 128:
 		return RATE_128
@@ -40,7 +40,7 @@ rate_cshake :: #force_inline proc(sec_strength: int) -> int {
 		return RATE_256
 	}

-	panic("crypto/sha3: invalid security strength")
+	panic_contextless("crypto/sha3: invalid security strength")
 }

 // right_encode and left_encode are defined to support 0 <= x < 2^2040
@@ -52,10 +52,10 @@ rate_cshake :: #force_inline proc(sec_strength: int) -> int {
 //
 // Thus we support 0 <= x < 2^128.

-@(private)
+@(private, rodata)
 _PAD: [RATE_128]byte // Biggest possible value of w per spec.

-bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) {
+bytepad :: proc "contextless" (ctx: ^Context, x_strings: [][]byte, w: int) {
 	// 1. z = left_encode(w) || X.
 	z_hi: u64
 	z_lo := left_right_encode(ctx, 0, u64(w), true)
@@ -70,9 +70,7 @@ bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) {

 		// This isn't actually possible, at least with the currently
 		// defined SP 800-185 routines.
-		if carry != 0 {
-			panic("crypto/sha3: bytepad input length overflow")
-		}
+		ensure_contextless(carry == 0, "crypto/sha3: bytepad input length overflow")
 	}

 	// We skip this step as we are doing a byte-oriented implementation
@@ -95,7 +93,7 @@ bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) {
 	}
 }

-encode_string :: #force_inline proc(ctx: ^Context, s: []byte) -> (u64, u64) {
+encode_string :: #force_inline proc "contextless" (ctx: ^Context, s: []byte) -> (u64, u64) {
 	l := encode_byte_len(ctx, len(s), true) // left_encode
 	update(ctx, s)

@@ -104,13 +102,13 @@ encode_string :: #force_inline proc(ctx: ^Context, s: []byte) -> (u64, u64) {
 	return hi, lo
 }

-encode_byte_len :: #force_inline proc(ctx: ^Context, l: int, is_left: bool) -> u64 {
+encode_byte_len :: #force_inline proc "contextless" (ctx: ^Context, l: int, is_left: bool) -> u64 {
 	hi, lo := bits.mul_u64(u64(l), 8)
 	return left_right_encode(ctx, hi, lo, is_left)
 }

@(private)
-left_right_encode :: proc(ctx: ^Context, hi, lo: u64, is_left: bool) -> u64 {
+left_right_encode :: proc "contextless" (ctx: ^Context, hi, lo: u64, is_left: bool) -> u64 {
 	HI_OFFSET :: 1
 	LO_OFFSET :: HI_OFFSET + 8
 	RIGHT_OFFSET :: LO_OFFSET + 8
@@ -16,7 +16,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte,
 // returning true iff the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
-// dst and plaintext MUST alias exactly or not at all.
+// dst and ciphertext MUST alias exactly or not at all.
@(require_results)
 open_oneshot :: proc(algo: Algorithm, dst, key, iv, aad, ciphertext, tag: []byte, impl: Implementation = nil) -> bool {
 	ctx: Context
@@ -1,8 +1,10 @@
 package aead

+import "core:crypto/aegis"
 import "core:crypto/aes"
 import "core:crypto/chacha20"
 import "core:crypto/chacha20poly1305"
+import "core:crypto/deoxysii"
 import "core:reflect"

 // Implementation is an AEAD implementation.  Most callers will not need
@@ -15,7 +17,7 @@ Implementation :: union {

 // MAX_TAG_SIZE is the maximum size tag that can be returned by any of the
 // Algorithms supported via this package.
-MAX_TAG_SIZE :: 16
+MAX_TAG_SIZE :: 32

 // Algorithm is the algorithm identifier associated with a given Context.
 Algorithm :: enum {
@@ -25,9 +27,14 @@ Algorithm :: enum {
 	AES_GCM_256,
 	CHACHA20POLY1305,
 	XCHACHA20POLY1305,
+	AEGIS_128L,
+	AEGIS_128L_256, // AEGIS-128L (256-bit tag)
+	AEGIS_256,
+	AEGIS_256_256, // AEGIS-256 (256-bit tag)
+	DEOXYS_II_256,
 }

-// ALGORITM_NAMES is the Agorithm to algorithm name string.
+// ALGORITHM_NAMES is the Algorithm to algorithm name string.
 ALGORITHM_NAMES := [Algorithm]string {
 	.Invalid           = "Invalid",
 	.AES_GCM_128       = "AES-GCM-128",
@@ -35,6 +42,11 @@ ALGORITHM_NAMES := [Algorithm]string {
 	.AES_GCM_256       = "AES-GCM-256",
 	.CHACHA20POLY1305  = "chacha20poly1305",
 	.XCHACHA20POLY1305 = "xchacha20poly1305",
+	.AEGIS_128L        = "AEGIS-128L",
+	.AEGIS_128L_256    = "AEGIS-128L-256",
+	.AEGIS_256         = "AEGIS-256",
+	.AEGIS_256_256     = "AEGIS-256-256",
+	.DEOXYS_II_256     = "Deoxys-II-256",
 }

 // TAG_SIZES is the Algorithm to tag size in bytes.
@@ -45,6 +57,11 @@ TAG_SIZES := [Algorithm]int {
 	.AES_GCM_256       = aes.GCM_TAG_SIZE,
 	.CHACHA20POLY1305  = chacha20poly1305.TAG_SIZE,
 	.XCHACHA20POLY1305 = chacha20poly1305.TAG_SIZE,
+	.AEGIS_128L        = aegis.TAG_SIZE_128,
+	.AEGIS_128L_256    = aegis.TAG_SIZE_256,
+	.AEGIS_256         = aegis.TAG_SIZE_128,
+	.AEGIS_256_256     = aegis.TAG_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.TAG_SIZE,
 }

 // KEY_SIZES is the Algorithm to key size in bytes.
@@ -55,6 +72,11 @@ KEY_SIZES := [Algorithm]int {
 	.AES_GCM_256       = aes.KEY_SIZE_256,
 	.CHACHA20POLY1305  = chacha20poly1305.KEY_SIZE,
 	.XCHACHA20POLY1305 = chacha20poly1305.KEY_SIZE,
+	.AEGIS_128L        = aegis.KEY_SIZE_128L,
+	.AEGIS_128L_256    = aegis.KEY_SIZE_128L,
+	.AEGIS_256         = aegis.KEY_SIZE_256,
+	.AEGIS_256_256     = aegis.KEY_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.KEY_SIZE,
 }

 // IV_SIZES is the Algorithm to initialization vector size in bytes.
@@ -67,6 +89,11 @@ IV_SIZES := [Algorithm]int {
 	.AES_GCM_256       = aes.GCM_IV_SIZE,
 	.CHACHA20POLY1305  = chacha20poly1305.IV_SIZE,
 	.XCHACHA20POLY1305 = chacha20poly1305.XIV_SIZE,
+	.AEGIS_128L        = aegis.IV_SIZE_128L,
+	.AEGIS_128L_256    = aegis.IV_SIZE_128L,
+	.AEGIS_256         = aegis.IV_SIZE_256,
+	.AEGIS_256_256     = aegis.IV_SIZE_256,
+	.DEOXYS_II_256     = deoxysii.IV_SIZE,
 }

 // Context is a concrete instantiation of a specific AEAD algorithm.
@@ -75,6 +102,8 @@ Context :: struct {
 	_impl: union {
 		aes.Context_GCM,
 		chacha20poly1305.Context,
+		aegis.Context,
+		deoxysii.Context,
 	},
 }

@@ -86,6 +115,11 @@ _IMPL_IDS := [Algorithm]typeid {
 	.AES_GCM_256       = typeid_of(aes.Context_GCM),
 	.CHACHA20POLY1305  = typeid_of(chacha20poly1305.Context),
 	.XCHACHA20POLY1305 = typeid_of(chacha20poly1305.Context),
+	.AEGIS_128L        = typeid_of(aegis.Context),
+	.AEGIS_128L_256    = typeid_of(aegis.Context),
+	.AEGIS_256         = typeid_of(aegis.Context),
+	.AEGIS_256_256     = typeid_of(aegis.Context),
+	.DEOXYS_II_256     = typeid_of(deoxysii.Context),
 }

 // init initializes a Context with a specific AEAD Algorithm.
@@ -94,9 +128,7 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat
 		reset(ctx)
 	}

-	if len(key) != KEY_SIZES[algorithm] {
-		panic("crypto/aead: invalid key size")
-	}
+	ensure(len(key) == KEY_SIZES[algorithm], "crypto/aead: invalid key size")

 	// Directly specialize the union by setting the type ID (save a copy).
 	reflect.set_union_variant_typeid(
@@ -113,6 +145,12 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat
 	case .XCHACHA20POLY1305:
 		impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION
 		chacha20poly1305.init_xchacha(&ctx._impl.(chacha20poly1305.Context), key, impl_)
+	case .AEGIS_128L, .AEGIS_128L_256, .AEGIS_256, .AEGIS_256_256:
+		impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION
+		aegis.init(&ctx._impl.(aegis.Context), key, impl_)
+	case .DEOXYS_II_256:
+		impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION
+		deoxysii.init(&ctx._impl.(deoxysii.Context), key, impl_)
 	case .Invalid:
 		panic("crypto/aead: uninitialized algorithm")
 	case:
@@ -127,11 +165,17 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat
 //
 // dst and plaintext MUST alias exactly or not at all.
 seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ensure(len(tag) == TAG_SIZES[ctx._algo], "crypto/aead: invalid tag size")
+
 	switch &impl in ctx._impl {
 	case aes.Context_GCM:
 		aes.seal_gcm(&impl, dst, tag, iv, aad, plaintext)
 	case chacha20poly1305.Context:
 		chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext)
+	case aegis.Context:
+		aegis.seal(&impl, dst, tag, iv, aad, plaintext)
+	case deoxysii.Context:
+		deoxysii.seal(&impl, dst, tag, iv, aad, plaintext)
 	case:
 		panic("crypto/aead: uninitialized algorithm")
 	}
@@ -145,11 +189,17 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
 // dst and plaintext MUST alias exactly or not at all.
@(require_results)
 open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	ensure(len(tag) == TAG_SIZES[ctx._algo], "crypto/aead: invalid tag size")
+
 	switch &impl in ctx._impl {
 	case aes.Context_GCM:
 		return aes.open_gcm(&impl, dst, iv, aad, ciphertext, tag)
 	case chacha20poly1305.Context:
 		return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag)
+	case aegis.Context:
+		return aegis.open(&impl, dst, iv, aad, ciphertext, tag)
+	case deoxysii.Context:
+		return deoxysii.open(&impl, dst, iv, aad, ciphertext, tag)
 	case:
 		panic("crypto/aead: uninitialized algorithm")
 	}
@@ -163,6 +213,10 @@ reset :: proc(ctx: ^Context) {
 		aes.reset_gcm(&impl)
 	case chacha20poly1305.Context:
 		chacha20poly1305.reset(&impl)
+	case aegis.Context:
+		aegis.reset(&impl)
+	case deoxysii.Context:
+		deoxysii.reset(&impl)
 	case:
 		// Calling reset repeatedly is fine.
 	}
@@ -0,0 +1,213 @@
+/*
+package aegis implements the AEGIS-128L and AEGIS-256 Authenticated
+Encryption with Additional Data algorithms.
+
+See:
+- [[ https://www.ietf.org/archive/id/draft-irtf-cfrg-aegis-aead-12.txt ]]
+*/
+package aegis
+
+import "core:bytes"
+import "core:crypto"
+import "core:crypto/aes"
+import "core:mem"
+
+// KEY_SIZE_128L is the AEGIS-128L key size in bytes.
+KEY_SIZE_128L :: 16
+// KEY_SIZE_256 is the AEGIS-256 key size in bytes.
+KEY_SIZE_256 :: 32
+// IV_SIZE_128L is the AEGIS-128L IV size in bytes.
+IV_SIZE_128L :: 16
+// IV_SIZE_256 is the AEGIS-256 IV size in bytes.
+IV_SIZE_256 :: 32
+// TAG_SIZE_128 is the AEGIS-128L or AEGIS-256 128-bit tag size in bytes.
+TAG_SIZE_128 :: 16
+// TAG_SIZE_256 is the AEGIS-128L or AEGIS-256 256-bit tag size in bytes.
+TAG_SIZE_256 :: 32
+
+@(private)
+_RATE_128L :: 32
+@(private)
+_RATE_256 :: 16
+@(private)
+_RATE_MAX :: _RATE_128L
+
+@(private, rodata)
+_C0 := [16]byte{
+	0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
+	0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
+}
+
+@(private, rodata)
+_C1 := [16]byte {
+	0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
+	0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
+}
+
+// Context is a keyed AEGIS-128L or AEGIS-256 instance.
+Context :: struct {
+	// Key storage sized for the larger (AEGIS-256) key; init copies the
+	// caller's key into the front of this buffer.
+	_key:            [KEY_SIZE_256]byte,
+	// Length in bytes of the key passed to init (KEY_SIZE_128L or
+	// KEY_SIZE_256); used to select the variant and the expected IV size.
+	_key_len:        int,
+	// Implementation selected by init; a .Hardware request is replaced by
+	// .Portable there when hardware acceleration is unavailable.
+	_impl:           aes.Implementation,
+	// Set by init, cleared by reset; checked by seal and open.
+	_is_initialized: bool,
+}
+
+// _validate_common_slice_sizes checks the argument sizes shared by seal
+// and open: the tag must be 128 or 256 bits, and the IV must have the
+// size that goes with the key length stored in ctx.  aad and text are
+// accepted without a length check (see the note at the end).
+@(private)
+_validate_common_slice_sizes :: proc (ctx: ^Context, tag, iv, aad, text: []byte) {
+	if n := len(tag); n != TAG_SIZE_128 && n != TAG_SIZE_256 {
+		panic("crypto/aegis: invalid tag size")
+	}
+
+	// An unrecognized key length leaves iv_ok false, so no IV is accepted.
+	iv_ok := false
+	if ctx._key_len == KEY_SIZE_128L {
+		iv_ok = len(iv) == IV_SIZE_128L
+	} else if ctx._key_len == KEY_SIZE_256 {
+		iv_ok = len(iv) == IV_SIZE_256
+	}
+	ensure(iv_ok,"crypto/aegis: invalid IV size")
+
+	#assert(size_of(int) == 8 || size_of(int) <= 4)
+	// As A_MAX and P_MAX are both defined to be 2^61 - 1 bytes, and
+	// the maximum length of a slice is bound by `size_of(int)`, where
+	// `int` is register sized, there is no need to check AAD/text
+	// lengths.
+}
+
+// init keys a Context for AEGIS-128L (16-byte key) or AEGIS-256 (32-byte
+// key).  Any other key length panics.  If the hardware implementation is
+// requested but not available, the portable one is used instead.
+init :: proc(ctx: ^Context, key: []byte, impl := aes.DEFAULT_IMPLEMENTATION) {
+	key_len := len(key)
+	if key_len != KEY_SIZE_128L && key_len != KEY_SIZE_256 {
+		panic("crypto/aegis: invalid key size")
+	}
+
+	copy(ctx._key[:], key)
+	ctx._key_len = key_len
+
+	selected := impl
+	if selected == .Hardware && !is_hardware_accelerated() {
+		selected = .Portable
+	}
+	ctx._impl = selected
+
+	ctx._is_initialized = true
+}
+
+// seal encrypts plaintext into dst and writes the authentication tag
+// covering aad and the ciphertext into tag, using the keyed Context and
+// the supplied iv.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ensure(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, plaintext)
+	ensure(len(dst) == len(plaintext), "crypto/aegis: invalid destination ciphertext size")
+	ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/aegis: dst and plaintext alias inexactly")
+
+	n_aad := len(aad)
+	n_pt := len(plaintext)
+	has_aad := n_aad > 0
+	has_pt := n_pt > 0
+
+	switch ctx._impl {
+	case .Hardware:
+		hw: State_HW
+		defer reset_state_hw(&hw)
+
+		init_hw(ctx, &hw, iv)
+		if has_aad {
+			absorb_hw(&hw, aad)
+		}
+		if has_pt {
+			enc_hw(&hw, dst, plaintext)
+		}
+		finalize_hw(&hw, tag, n_aad, n_pt)
+	case .Portable:
+		sw: State_SW
+		defer reset_state_sw(&sw)
+
+		init_sw(ctx, &sw, iv)
+		if has_aad {
+			absorb_sw(&sw, aad)
+		}
+		if has_pt {
+			enc_sw(&sw, dst, plaintext)
+		}
+		finalize_sw(&sw, tag, n_aad, n_pt)
+	case:
+		panic("core/crypto/aegis: not implemented")
+	}
+}
+
+// open authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	ensure(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, ciphertext)
+	ensure(len(dst) == len(ciphertext), "crypto/aegis: invalid destination plaintext size")
+	ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/aegis: dst and ciphertext alias inexactly")
+
+	// The recomputed tag has the same length as the supplied one; the
+	// backing buffer is sized for the larger (256-bit) tag.
+	tmp: [TAG_SIZE_256]byte
+	derived_tag := tmp[:len(tag)]
+	aad_len, ct_len := len(aad), len(ciphertext)
+
+	// Decrypt into dst and recompute the tag with the selected
+	// implementation.  The working state is wiped on scope exit.
+	switch ctx._impl {
+	case .Hardware:
+		st: State_HW
+		defer reset_state_hw(&st)
+
+		init_hw(ctx, &st, iv)
+
+		if aad_len > 0 {
+			absorb_hw(&st, aad)
+		}
+
+		if ct_len > 0 {
+			dec_hw(&st, dst, ciphertext)
+		}
+
+		finalize_hw(&st, derived_tag, aad_len, ct_len)
+	case .Portable:
+		st: State_SW
+		defer reset_state_sw(&st)
+
+		init_sw(ctx, &st, iv)
+
+		if aad_len > 0 {
+			absorb_sw(&st, aad)
+		}
+
+		if ct_len > 0 {
+			dec_sw(&st, dst, ciphertext)
+		}
+
+		finalize_sw(&st, derived_tag, aad_len, ct_len)
+	case:
+		panic("core/crypto/aegis: not implemented")
+	}
+
+	// On tag mismatch, wipe both the recomputed tag and the plaintext
+	// that was already written to dst before reporting failure.
+	if crypto.compare_constant_time(tag, derived_tag) != 1 {
+		mem.zero_explicit(raw_data(derived_tag), len(derived_tag))
+		mem.zero_explicit(raw_data(dst), ct_len)
+		return false
+	}
+
+	return true
+}
+
+// reset wipes the key material held by the Context and marks it as
+// uninitialized.  init must be called again before further use.
+reset :: proc "contextless" (ctx: ^Context) {
+	ctx._is_initialized = false
+	ctx._key_len = 0
+	mem.zero_explicit(&ctx._key, size_of(ctx._key))
+}
@@ -0,0 +1,452 @@
+package aegis
+
+import aes "core:crypto/_aes/ct64"
+import "core:encoding/endian"
+import "core:mem"
+
+// This uses the bitsliced 64-bit general purpose register SWAR AES
+// round function.  The intermediate state is stored in interleaved
+// but NOT orthogonalized form, as leaving things in the orthogonalized
+// format would overly complicate the update implementation.
+//
+// Note/perf: Per Frank Denis and a review of the specification, it is
+// possible to gain slightly more performance by leaving the state in
+// orthogonalized form while doing initialization, finalization, and
+// absorbing AAD.  This implementation opts out of those optimizations
+// for the sake of simplicity.
+//
+// The update function leverages the parallelism (4xblocks) at once.
+
+// State_SW is the working state of the portable implementation.
+//
+// Each state word sN is kept as a pair of u64 values (sN_0, sN_1) in the
+// form produced by aes.load_interleaved.  The AEGIS-128L paths use
+// s0..s7; the AEGIS-256 paths only read and write s0..s5.
+@(private)
+State_SW :: struct {
+	s0_0, s0_1: u64,
+	s1_0, s1_1: u64,
+	s2_0, s2_1: u64,
+	s3_0, s3_1: u64,
+	s4_0, s4_1: u64,
+	s5_0, s5_1: u64,
+	s6_0, s6_1: u64,
+	s7_0, s7_1: u64,
+	// Scratch buffers for the update functions: q_b holds the blocks fed
+	// through the AES round, q_k the values passed to add_round_key.
+	q_k, q_b:   [8]u64,
+	// _RATE_128L or _RATE_256, set by init_sw; selects the variant in
+	// absorb/enc/dec/finalize.
+	rate:       int,
+}
+
+// init_sw loads the initial state for the variant selected by
+// ctx._key_len from the key, iv, and the constants _C0/_C1, sets
+// st.rate, and then runs the initial update rounds.
+//
+// If ctx._key_len is neither KEY_SIZE_128L nor KEY_SIZE_256, st is left
+// unmodified.
+@(private)
+init_sw :: proc "contextless" (ctx: ^Context, st: ^State_SW, iv: []byte) {
+	switch ctx._key_len {
+	case KEY_SIZE_128L:
+		key_0, key_1 := aes.load_interleaved(ctx._key[:16])
+		iv_0, iv_1 := aes.load_interleaved(iv)
+
+		// s0 = key ~ iv, s1 = C1, s2 = C0, s3 = C1, s4 = key ~ iv,
+		// s5 = key ~ C0, s6 = key ~ C1, s7 = key ~ C0.
+		st.s0_0, st.s0_1 = aes.xor_interleaved(key_0, key_1, iv_0, iv_1)
+		st.s1_0, st.s1_1 = aes.load_interleaved(_C1[:])
+		st.s2_0, st.s2_1 = aes.load_interleaved(_C0[:])
+		st.s3_0, st.s3_1 = st.s1_0, st.s1_1
+		st.s4_0, st.s4_1 = st.s0_0, st.s0_1
+		st.s5_0, st.s5_1 = aes.xor_interleaved(key_0, key_1, st.s2_0, st.s2_1)
+		st.s6_0, st.s6_1 = aes.xor_interleaved(key_0, key_1, st.s1_0, st.s1_1)
+		st.s7_0, st.s7_1 = st.s5_0, st.s5_1
+		st.rate = _RATE_128L
+
+		// 10 updates with (iv, key) as the two message blocks.
+		for _ in 0 ..< 10 {
+			update_sw_128l(st, iv_0, iv_1, key_0, key_1)
+		}
+	case KEY_SIZE_256:
+		// Key and iv are each split into two 16-byte halves.
+		k0_0, k0_1 := aes.load_interleaved(ctx._key[:16])
+		k1_0, k1_1 := aes.load_interleaved(ctx._key[16:])
+		n0_0, n0_1 := aes.load_interleaved(iv[:16])
+		n1_0, n1_1 := aes.load_interleaved(iv[16:])
+
+		// s0 = k0 ~ n0, s1 = k1 ~ n1, s2 = C1, s3 = C0,
+		// s4 = k0 ~ C0, s5 = k1 ~ C1.
+		st.s0_0, st.s0_1 = aes.xor_interleaved(k0_0, k0_1, n0_0, n0_1)
+		st.s1_0, st.s1_1 = aes.xor_interleaved(k1_0, k1_1, n1_0, n1_1)
+		st.s2_0, st.s2_1 = aes.load_interleaved(_C1[:])
+		st.s3_0, st.s3_1 = aes.load_interleaved(_C0[:])
+		st.s4_0, st.s4_1 = aes.xor_interleaved(k0_0, k0_1, st.s3_0, st.s3_1)
+		st.s5_0, st.s5_1 = aes.xor_interleaved(k1_0, k1_1, st.s2_0, st.s2_1)
+		st.rate = _RATE_256
+
+		// Snapshot k0 ~ n0 and k1 ~ n1 before the state is updated, then
+		// run 4 iterations of updates with k0, k1, k0 ~ n0, k1 ~ n1.
+		u0_0, u0_1, u1_0, u1_1 := st.s0_0, st.s0_1, st.s1_0, st.s1_1
+		for _ in 0 ..< 4 {
+			update_sw_256(st, k0_0, k0_1)
+			update_sw_256(st, k1_0, k1_1)
+			update_sw_256(st, u0_0, u0_1)
+			update_sw_256(st, u1_0, u1_1)
+		}
+	}
+}
+
+// update_sw_128l advances the 8-word state by one step with the message
+// blocks m0 (m0_0, m0_1) and m1 (m1_0, m1_1).
+//
+// The eight AES rounds are evaluated as two batches of four blocks.  In
+// each batch q_b holds the round inputs and q_k the values combined via
+// add_round_key:
+//   new s0 = round(s7) + (s0 ~ m0), new s1 = round(s0) + s1,
+//   new s2 = round(s1) + s2,        new s3 = round(s2) + s3,
+//   new s4 = round(s3) + (s4 ~ m1), new s5 = round(s4) + s5,
+//   new s6 = round(s5) + s6,        new s7 = round(s6) + s7.
+@(private = "file")
+update_sw_128l :: proc "contextless" (st: ^State_SW, m0_0, m0_1, m1_0, m1_1: u64) {
+	// Batch 1: inputs (s7, s0, s1, s2), keys (s0 ~ m0, s1, s2, s3).
+	st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s0_0, st.s0_1, m0_0, m0_1)
+	st.q_k[1], st.q_k[5] = st.s1_0, st.s1_1
+	st.q_k[2], st.q_k[6] = st.s2_0, st.s2_1
+	st.q_k[3], st.q_k[7] = st.s3_0, st.s3_1
+	aes.orthogonalize(&st.q_k)
+
+	st.q_b[0], st.q_b[4] = st.s7_0, st.s7_1
+	st.q_b[1], st.q_b[5] = st.s0_0, st.s0_1
+	st.q_b[2], st.q_b[6] = st.s1_0, st.s1_1
+	st.q_b[3], st.q_b[7] = st.s2_0, st.s2_1
+	aes.orthogonalize(&st.q_b)
+
+	aes.sub_bytes(&st.q_b)
+	aes.shift_rows(&st.q_b)
+	aes.mix_columns(&st.q_b)
+	aes.add_round_key(&st.q_b, st.q_k[:])
+	aes.orthogonalize(&st.q_b)
+
+	// The new s3 is parked in locals: the old st.s3 is still needed as an
+	// input to batch 2.  (Old s7 was already consumed above.)
+	st.s0_0, st.s0_1 = st.q_b[0], st.q_b[4]
+	st.s1_0, st.s1_1 = st.q_b[1], st.q_b[5]
+	st.s2_0, st.s2_1 = st.q_b[2], st.q_b[6]
+	s3_0, s3_1 := st.q_b[3], st.q_b[7]
+
+	// Batch 2: inputs (s3, s4, s5, s6), keys (s4 ~ m1, s5, s6, s7).
+	st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s4_0, st.s4_1, m1_0, m1_1)
+	st.q_k[1], st.q_k[5] = st.s5_0, st.s5_1
+	st.q_k[2], st.q_k[6] = st.s6_0, st.s6_1
+	st.q_k[3], st.q_k[7] = st.s7_0, st.s7_1
+	aes.orthogonalize(&st.q_k)
+
+	st.q_b[0], st.q_b[4] = st.s3_0, st.s3_1
+	st.q_b[1], st.q_b[5] = st.s4_0, st.s4_1
+	st.q_b[2], st.q_b[6] = st.s5_0, st.s5_1
+	st.q_b[3], st.q_b[7] = st.s6_0, st.s6_1
+	aes.orthogonalize(&st.q_b)
+
+	aes.sub_bytes(&st.q_b)
+	aes.shift_rows(&st.q_b)
+	aes.mix_columns(&st.q_b)
+	aes.add_round_key(&st.q_b, st.q_k[:])
+	aes.orthogonalize(&st.q_b)
+
+	st.s3_0, st.s3_1 = s3_0, s3_1
+	st.s4_0, st.s4_1 = st.q_b[0], st.q_b[4]
+	st.s5_0, st.s5_1 = st.q_b[1], st.q_b[5]
+	st.s6_0, st.s6_1 = st.q_b[2], st.q_b[6]
+	st.s7_0, st.s7_1 = st.q_b[3], st.q_b[7]
+}
+
+// update_sw_256 advances the 6-word state by one step with the message
+// block m (m_0, m_1).
+//
+// The six AES rounds are evaluated as a batch of four followed by a
+// batch of two:
+//   new s0 = round(s5) + (s0 ~ m), new s1 = round(s0) + s1,
+//   new s2 = round(s1) + s2,       new s3 = round(s2) + s3,
+//   new s4 = round(s3) + s4,       new s5 = round(s4) + s5.
+@(private = "file")
+update_sw_256 :: proc "contextless" (st: ^State_SW, m_0, m_1: u64) {
+	// Batch 1: inputs (s5, s0, s1, s2), keys (s0 ~ m, s1, s2, s3).
+	st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s0_0, st.s0_1, m_0, m_1)
+	st.q_k[1], st.q_k[5] = st.s1_0, st.s1_1
+	st.q_k[2], st.q_k[6] = st.s2_0, st.s2_1
+	st.q_k[3], st.q_k[7] = st.s3_0, st.s3_1
+	aes.orthogonalize(&st.q_k)
+
+	st.q_b[0], st.q_b[4] = st.s5_0, st.s5_1
+	st.q_b[1], st.q_b[5] = st.s0_0, st.s0_1
+	st.q_b[2], st.q_b[6] = st.s1_0, st.s1_1
+	st.q_b[3], st.q_b[7] = st.s2_0, st.s2_1
+	aes.orthogonalize(&st.q_b)
+
+	aes.sub_bytes(&st.q_b)
+	aes.shift_rows(&st.q_b)
+	aes.mix_columns(&st.q_b)
+	aes.add_round_key(&st.q_b, st.q_k[:])
+	aes.orthogonalize(&st.q_b)
+
+	// The new s3 is parked in locals: the old st.s3 is still needed as an
+	// input to batch 2.
+	st.s0_0, st.s0_1 = st.q_b[0], st.q_b[4]
+	st.s1_0, st.s1_1 = st.q_b[1], st.q_b[5]
+	st.s2_0, st.s2_1 = st.q_b[2], st.q_b[6]
+	s3_0, s3_1 := st.q_b[3], st.q_b[7]
+
+	// Batch 2: inputs (s3, s4), keys (s4, s5).  Only slots 0/4 and 1/5 of
+	// q_k and q_b are reloaded; the other slots keep whatever batch 1
+	// left there, and their results are not read back below.
+	st.q_k[0], st.q_k[4] = st.s4_0, st.s4_1
+	st.q_k[1], st.q_k[5] = st.s5_0, st.s5_1
+	aes.orthogonalize(&st.q_k)
+
+	st.q_b[0], st.q_b[4] = st.s3_0, st.s3_1
+	st.q_b[1], st.q_b[5] = st.s4_0, st.s4_1
+	aes.orthogonalize(&st.q_b)
+
+	aes.sub_bytes(&st.q_b)
+	aes.shift_rows(&st.q_b)
+	aes.mix_columns(&st.q_b)
+	aes.add_round_key(&st.q_b, st.q_k[:])
+	aes.orthogonalize(&st.q_b)
+
+	st.s3_0, st.s3_1 = s3_0, s3_1
+	st.s4_0, st.s4_1 = st.q_b[0], st.q_b[4]
+	st.s5_0, st.s5_1 = st.q_b[1], st.q_b[5]
+}
+
+// absorb_sw_128l feeds one 32-byte block of associated data into the
+// state: ai[:16] and ai[16:] become the two message blocks of a single
+// update.  Bounds checks are disabled; the caller must supply at least
+// _RATE_128L bytes.
+@(private = "file")
+absorb_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, ai: []byte) #no_bounds_check {
+	t0_0, t0_1 := aes.load_interleaved(ai[:16])
+	t1_0, t1_1 := aes.load_interleaved(ai[16:])
+	update_sw_128l(st, t0_0, t0_1, t1_0, t1_1)
+}
+
+// absorb_sw_256 feeds one block of associated data, loaded from the
+// start of ai, into the state with a single update.
+// NOTE(review): callers pass slices longer than 16 bytes here; this
+// presumably relies on aes.load_interleaved reading only the first 16
+// bytes - confirm.
+@(private = "file")
+absorb_sw_256 :: #force_inline proc "contextless" (st: ^State_SW, ai: []byte) {
+	m_0, m_1 := aes.load_interleaved(ai)
+	update_sw_256(st, m_0, m_1)
+}
+
+// absorb_sw feeds the associated data into the state one rate-sized
+// block at a time.  A trailing partial block is zero-padded up to the
+// rate before being absorbed.
+@(private)
+absorb_sw :: proc "contextless" (st: ^State_SW, aad: []byte) #no_bounds_check {
+	remaining := aad
+
+	// Full blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for len(remaining) >= _RATE_128L {
+			absorb_sw_128l(st, remaining)
+			remaining = remaining[_RATE_128L:]
+		}
+	case _RATE_256:
+		for len(remaining) >= _RATE_256 {
+			absorb_sw_256(st, remaining)
+			remaining = remaining[_RATE_256:]
+		}
+	}
+
+	if len(remaining) == 0 {
+		return
+	}
+
+	// Trailing partial block, padded with `0`s.  AAD is not confidential,
+	// so the scratch buffer is not wiped.
+	padded: [_RATE_MAX]byte
+	copy(padded[:], remaining)
+	switch st.rate {
+	case _RATE_128L:
+		absorb_sw_128l(st, padded[:])
+	case _RATE_256:
+		absorb_sw_256(st, padded[:])
+	}
+}
+
+// z_sw_128l derives the two keystream blocks from the current state
+// without modifying it:
+//   z0 = s6 ~ s1 ~ (s2 & s3)
+//   z1 = s2 ~ s5 ~ (s6 & s7)
+// Returned as (z0_0, z0_1, z1_0, z1_1).
+@(private = "file", require_results)
+z_sw_128l :: proc "contextless" (st: ^State_SW) -> (u64, u64, u64, u64) {
+	z0_0, z0_1 := aes.and_interleaved(st.s2_0, st.s2_1, st.s3_0, st.s3_1)
+	z0_0, z0_1 = aes.xor_interleaved(st.s1_0, st.s1_1, z0_0, z0_1)
+	z0_0, z0_1 = aes.xor_interleaved(st.s6_0, st.s6_1, z0_0, z0_1)
+
+	z1_0, z1_1 := aes.and_interleaved(st.s6_0, st.s6_1, st.s7_0, st.s7_1)
+	z1_0, z1_1 = aes.xor_interleaved(st.s5_0, st.s5_1, z1_0, z1_1)
+	z1_0, z1_1 = aes.xor_interleaved(st.s2_0, st.s2_1, z1_0, z1_1)
+
+	return z0_0, z0_1, z1_0, z1_1
+}
+
+// z_sw_256 derives the keystream block from the current state without
+// modifying it:
+//   z = s1 ~ s4 ~ s5 ~ (s2 & s3)
+@(private = "file", require_results)
+z_sw_256 :: proc "contextless" (st: ^State_SW) -> (u64, u64) {
+	z_0, z_1 := aes.and_interleaved(st.s2_0, st.s2_1, st.s3_0, st.s3_1)
+	z_0, z_1 = aes.xor_interleaved(st.s5_0, st.s5_1, z_0, z_1)
+	z_0, z_1 = aes.xor_interleaved(st.s4_0, st.s4_1, z_0, z_1)
+	return aes.xor_interleaved(st.s1_0, st.s1_1, z_0, z_1)
+}
+
+// enc_sw_128l encrypts one 32-byte block from xi into ci.
+//
+// The keystream is taken from the state BEFORE the update, the state is
+// then updated with the plaintext, and the output is plaintext ~
+// keystream.  ci and xi may be the same slice: xi is fully loaded before
+// ci is written.
+@(private = "file")
+enc_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, ci, xi: []byte) #no_bounds_check {
+	z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st)
+
+	t0_0, t0_1 := aes.load_interleaved(xi[:16])
+	t1_0, t1_1 := aes.load_interleaved(xi[16:])
+	update_sw_128l(st, t0_0, t0_1, t1_0, t1_1)
+
+	out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1)
+	out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1)
+	aes.store_interleaved(ci[:16], out0_0, out0_1)
+	aes.store_interleaved(ci[16:], out1_0, out1_1)
+}
+
+// enc_sw_256 encrypts one block from the start of xi into the start of
+// ci.
+//
+// The keystream is taken from the state BEFORE the update, the state is
+// then updated with the plaintext, and the output is plaintext ~
+// keystream.  ci and xi may be the same slice: xi is loaded before ci is
+// written.
+@(private = "file")
+enc_sw_256 :: #force_inline proc "contextless" (st: ^State_SW, ci, xi: []byte) #no_bounds_check {
+	z_0, z_1 := z_sw_256(st)
+
+	xi_0, xi_1 := aes.load_interleaved(xi)
+	update_sw_256(st, xi_0, xi_1)
+
+	ci_0, ci_1 := aes.xor_interleaved(xi_0, xi_1, z_0, z_1)
+	aes.store_interleaved(ci, ci_0, ci_1)
+}
+
+// enc_sw encrypts src into dst one rate-sized block at a time.  A
+// trailing partial block is zero-padded, encrypted in a scratch buffer,
+// and only the first `l` bytes of the result are copied to dst.
+//
+// Bounds checks are disabled; the caller must ensure that dst is at
+// least as long as src.
+@(private)
+enc_sw :: proc "contextless" (st: ^State_SW, dst, src: []byte) #no_bounds_check {
+	ci, xi, l := dst, src, len(src)
+
+	// Full blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for l >= _RATE_128L {
+			enc_sw_128l(st, ci, xi)
+			ci = ci[_RATE_128L:]
+			xi = xi[_RATE_128L:]
+			l -= _RATE_128L
+		}
+	case _RATE_256:
+		for l >= _RATE_256 {
+			enc_sw_256(st, ci, xi)
+			ci = ci[_RATE_256:]
+			xi = xi[_RATE_256:]
+			l -= _RATE_256
+		}
+	}
+
+	// Pad out the remainder with `0`s till it is rate sized.
+	if l > 0 {
+		// tmp first holds the padded plaintext and is then encrypted in
+		// place, so it contains ciphertext when this scope ends.
+		tmp: [_RATE_MAX]byte // Ciphertext is not confidential.
+		copy(tmp[:], xi)
+		switch st.rate {
+		case _RATE_128L:
+			enc_sw_128l(st, tmp[:], tmp[:])
+		case _RATE_256:
+			enc_sw_256(st, tmp[:], tmp[:])
+		}
+		copy(ci, tmp[:l])
+	}
+}
+
+// dec_sw_128l decrypts one 32-byte block from ci into xi.
+//
+// The keystream is taken from the state BEFORE the update, the plaintext
+// is recovered as ciphertext ~ keystream, and the state is updated with
+// the recovered plaintext.  xi and ci may be the same slice: ci is fully
+// loaded before xi is written.
+@(private = "file")
+dec_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, xi, ci: []byte) #no_bounds_check {
+	z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st)
+
+	t0_0, t0_1 := aes.load_interleaved(ci[:16])
+	t1_0, t1_1 := aes.load_interleaved(ci[16:])
+	out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1)
+	out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1)
+
+	update_sw_128l(st, out0_0, out0_1, out1_0, out1_1)
+	aes.store_interleaved(xi[:16], out0_0, out0_1)
+	aes.store_interleaved(xi[16:], out1_0, out1_1)
+}
+
+// dec_sw_256 decrypts one block from the start of ci into the start of
+// xi.
+//
+// The keystream is taken from the state BEFORE the update, the plaintext
+// is recovered as ciphertext ~ keystream, and the state is updated with
+// the recovered plaintext.
+@(private = "file")
+dec_sw_256 :: #force_inline proc "contextless" (st: ^State_SW, xi, ci: []byte) #no_bounds_check {
+	z_0, z_1 := z_sw_256(st)
+
+	ci_0, ci_1 := aes.load_interleaved(ci)
+	xi_0, xi_1 := aes.xor_interleaved(ci_0, ci_1, z_0, z_1)
+
+	update_sw_256(st, xi_0, xi_1)
+	aes.store_interleaved(xi, xi_0, xi_1)
+}
+
+// dec_partial_sw_128l decrypts a final ciphertext block cn that is
+// shorter than the rate into xn, and updates the state with the
+// zero-padded plaintext.
+//
+// The scratch buffer holds plaintext, so it is wiped on return.
+@(private = "file")
+dec_partial_sw_128l :: proc "contextless" (st: ^State_SW, xn, cn: []byte) #no_bounds_check {
+	tmp: [_RATE_128L]byte
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	// Zero-pad the ciphertext to a full block and XOR with the keystream.
+	z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st)
+	copy(tmp[:], cn)
+
+	t0_0, t0_1 := aes.load_interleaved(tmp[:16])
+	t1_0, t1_1 := aes.load_interleaved(tmp[16:])
+	out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1)
+	out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1)
+
+	// Only the first len(xn) bytes are real plaintext.
+	aes.store_interleaved(tmp[:16], out0_0, out0_1)
+	aes.store_interleaved(tmp[16:], out1_0, out1_1)
+	copy(xn, tmp[:])
+
+	// Bytes past the plaintext are keystream (padding ~ z); clear them so
+	// the state absorbs the plaintext padded with `0`s.
+	for off := len(xn); off < _RATE_128L; off += 1 {
+		tmp[off] = 0
+	}
+	out0_0, out0_1 = aes.load_interleaved(tmp[:16])
+	out1_0, out1_1 = aes.load_interleaved(tmp[16:])
+	update_sw_128l(st, out0_0, out0_1, out1_0, out1_1)
+}
+
+// dec_partial_sw_256 decrypts a final ciphertext block cn that is
+// shorter than the rate into xn, and updates the state with the
+// zero-padded plaintext.
+//
+// The scratch buffer holds plaintext, so it is wiped on return.
+@(private = "file")
+dec_partial_sw_256 :: proc "contextless" (st: ^State_SW, xn, cn: []byte) #no_bounds_check {
+	tmp: [_RATE_256]byte
+	defer mem.zero_explicit(&tmp, size_of(tmp))
+
+	// Zero-pad the ciphertext to a full block and XOR with the keystream.
+	z_0, z_1 := z_sw_256(st)
+	copy(tmp[:], cn)
+
+	cn_0, cn_1 := aes.load_interleaved(tmp[:])
+	xn_0, xn_1 := aes.xor_interleaved(cn_0, cn_1, z_0, z_1)
+
+	// Only the first len(xn) bytes are real plaintext.
+	aes.store_interleaved(tmp[:], xn_0, xn_1)
+	copy(xn, tmp[:])
+
+	// Bytes past the plaintext are keystream (padding ~ z); clear them so
+	// the state absorbs the plaintext padded with `0`s.
+	for off := len(xn); off < _RATE_256; off += 1 {
+		tmp[off] = 0
+	}
+	xn_0, xn_1 = aes.load_interleaved(tmp[:])
+	update_sw_256(st, xn_0, xn_1)
+}
+
+// dec_sw decrypts src into dst one rate-sized block at a time, handing
+// a trailing partial block to the matching dec_partial_sw_* routine.
+//
+// Bounds checks are disabled; the caller must ensure that dst is at
+// least as long as src.
+// NOTE(review): the partial routines copy len(xn) bytes into dst, so a
+// dst longer than src would receive extra bytes - open enforces equal
+// lengths before calling this.
+@(private)
+dec_sw :: proc "contextless" (st: ^State_SW, dst, src: []byte) #no_bounds_check {
+	xi, ci, l := dst, src, len(src)
+
+	// Full blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for l >= _RATE_128L {
+			dec_sw_128l(st, xi, ci)
+			xi = xi[_RATE_128L:]
+			ci = ci[_RATE_128L:]
+			l -= _RATE_128L
+		}
+	case _RATE_256:
+		for l >= _RATE_256 {
+			dec_sw_256(st, xi, ci)
+			xi = xi[_RATE_256:]
+			ci = ci[_RATE_256:]
+			l -= _RATE_256
+		}
+	}
+
+	// Process the remainder.
+	if l > 0 {
+		switch st.rate {
+		case _RATE_128L:
+			dec_partial_sw_128l(st, xi, ci)
+		case _RATE_256:
+			dec_partial_sw_256(st, xi, ci)
+		}
+	}
+}
+
+// finalize_sw computes the authentication tag and writes it to tag.
+//
+// ad_len and msg_len are byte counts; they are converted to bit counts
+// and encoded as two little-endian u64 values to form the length block.
+// len(tag) selects between the 128-bit and 256-bit tag; for any other
+// length nothing is written to tag.
+@(private)
+finalize_sw :: proc "contextless" (st: ^State_SW, tag: []byte, ad_len, msg_len: int) {
+	tmp: [16]byte
+	endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8)
+	endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8)
+
+	t_0, t_1 := aes.load_interleaved(tmp[:])
+
+	// t0/t1 are the two halves of the tag material; both are assigned in
+	// each rate case below.
+	t0_0, t0_1, t1_0, t1_1: u64 = ---, ---, ---, ---
+	switch st.rate {
+	case _RATE_128L:
+		// t = s2 ~ lengths, then 7 updates with (t, t).
+		t_0, t_1 = aes.xor_interleaved(st.s2_0, st.s2_1, t_0, t_1)
+		for _ in 0 ..< 7 {
+			update_sw_128l(st, t_0, t_1, t_0, t_1)
+		}
+
+		// t0 = s0 ~ s1 ~ s2 ~ s3
+		t0_0, t0_1 = aes.xor_interleaved(st.s0_0, st.s0_1, st.s1_0, st.s1_1)
+		t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s2_0, st.s2_1)
+		t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s3_0, st.s3_1)
+
+		// t1 = s4 ~ s5 ~ s6, with s7 folded in only for the 256-bit tag.
+		t1_0, t1_1 = aes.xor_interleaved(st.s4_0, st.s4_1, st.s5_0, st.s5_1)
+		t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s6_0, st.s6_1)
+		if len(tag) == TAG_SIZE_256 {
+			t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s7_0, st.s7_1)
+		}
+	case _RATE_256:
+		// t = s3 ~ lengths, then 7 updates with t.
+		t_0, t_1 = aes.xor_interleaved(st.s3_0, st.s3_1, t_0, t_1)
+		for _ in 0 ..< 7 {
+			update_sw_256(st, t_0, t_1)
+		}
+
+		// t0 = s0 ~ s1 ~ s2
+		t0_0, t0_1 = aes.xor_interleaved(st.s0_0, st.s0_1, st.s1_0, st.s1_1)
+		t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s2_0, st.s2_1)
+
+		// t1 = s3 ~ s4 ~ s5
+		t1_0, t1_1 = aes.xor_interleaved(st.s3_0, st.s3_1, st.s4_0, st.s4_1)
+		t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s5_0, st.s5_1)
+	}
+	// 128-bit tag: t0 ~ t1.  256-bit tag: t0 followed by t1.
+	switch len(tag) {
+	case TAG_SIZE_128:
+		t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, t1_0, t1_1)
+		aes.store_interleaved(tag, t0_0, t0_1)
+	case TAG_SIZE_256:
+		aes.store_interleaved(tag[:16], t0_0, t0_1)
+		aes.store_interleaved(tag[16:], t1_0, t1_1)
+	}
+}
+
+// reset_state_sw wipes the entire portable working state, including the
+// scratch buffers and the rate field.
+@(private)
+reset_state_sw :: proc "contextless" (st: ^State_SW) {
+	mem.zero_explicit(st, size_of(st^))
+}
@@ -0,0 +1,44 @@
+#+build !amd64
+package aegis
+
+@(private = "file")
+ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported"
+
+@(private)
+State_HW :: struct {} // Empty placeholder: this file is only built for non-amd64 targets.
+
+// is_hardware_accelerated returns true iff hardware accelerated AEGIS
+// is supported, which is never the case in this non-amd64 build.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return false
+}
+
+@(private)
+init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, no hardware backend in this build.
+}
+
+@(private)
+absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, no hardware backend in this build.
+}
+
+@(private)
+enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, no hardware backend in this build.
+}
+
+@(private)
+dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, no hardware backend in this build.
+}
+
+@(private)
+finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, no hardware backend in this build.
+}
+
+@(private)
+reset_state_hw :: proc "contextless" (st: ^State_HW) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED) // Stub: unconditionally panics, even for reset.
+}
@@ -0,0 +1,389 @@
+#+build amd64
+package aegis
+
+import "base:intrinsics"
+import "core:crypto/aes"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+State_HW :: struct {
+	s0:   x86.__m128i, // s0 .. s5 are used by both variants.
+	s1:   x86.__m128i,
+	s2:   x86.__m128i,
+	s3:   x86.__m128i,
+	s4:   x86.__m128i,
+	s5:   x86.__m128i,
+	s6:   x86.__m128i, // s6 and s7 are only touched by the _RATE_128L code paths.
+	s7:   x86.__m128i,
+	rate: int, // _RATE_128L or _RATE_256, set by init_hw; selects the code path everywhere else.
+}
+
+// is_hardware_accelerated returns true iff hardware accelerated AEGIS
+// is supported, which here is exactly when hardware accelerated AES is.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return aes.is_hardware_accelerated()
+}
+
+// init_hw keys the hardware state from ctx._key and iv.  The variant is
+// selected by ctx._key_len; for any other key length the state is left
+// untouched.
+@(private, enable_target_feature = "sse2,aes")
+init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) {
+	switch ctx._key_len {
+	case KEY_SIZE_128L:
+		k := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
+		n := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv)))
+		c1 := intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
+		c0 := intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
+
+		k_xor_n := x86._mm_xor_si128(k, n)
+		k_xor_c0 := x86._mm_xor_si128(k, c0)
+		k_xor_c1 := x86._mm_xor_si128(k, c1)
+
+		st.s0 = k_xor_n
+		st.s1 = c1
+		st.s2 = c0
+		st.s3 = c1
+		st.s4 = k_xor_n
+		st.s5 = k_xor_c0
+		st.s6 = k_xor_c1
+		st.s7 = k_xor_c0
+		st.rate = _RATE_128L
+
+		// 10 update rounds absorbing (nonce, key).
+		for _ in 0 ..< 10 {
+			update_hw_128l(st, n, k)
+		}
+	case KEY_SIZE_256:
+		key_lo := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0]))
+		key_hi := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16]))
+		nonce_lo := intrinsics.unaligned_load((^x86.__m128i)(&iv[0]))
+		nonce_hi := intrinsics.unaligned_load((^x86.__m128i)(&iv[16]))
+		c1 := intrinsics.unaligned_load((^x86.__m128i)(&_C1[0]))
+		c0 := intrinsics.unaligned_load((^x86.__m128i)(&_C0[0]))
+
+		st.s0 = x86._mm_xor_si128(key_lo, nonce_lo)
+		st.s1 = x86._mm_xor_si128(key_hi, nonce_hi)
+		st.s2 = c1
+		st.s3 = c0
+		st.s4 = x86._mm_xor_si128(key_lo, c0) // k0 ^ C0
+		st.s5 = x86._mm_xor_si128(key_hi, c1) // k1 ^ C1
+		st.rate = _RATE_256
+
+		// Snapshot k0 ^ n0 and k1 ^ n1 before the state is updated.
+		kn_lo, kn_hi := st.s0, st.s1
+		for _ in 0 ..< 4 {
+			update_hw_256(st, key_lo)
+			update_hw_256(st, key_hi)
+			update_hw_256(st, kn_lo)
+			update_hw_256(st, kn_hi)
+		}
+	}
+}
+
+// update_hw_128l applies one state update round for the 8-word variant,
+// absorbing m0 into s0 and m1 into s4.  All new words are computed from
+// the old state before any of them is written back.
+@(private = "file", enable_target_feature = "sse2,aes")
+update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) {
+	n0 := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0))
+	n1 := x86._mm_aesenc_si128(st.s0, st.s1)
+	n2 := x86._mm_aesenc_si128(st.s1, st.s2)
+	n3 := x86._mm_aesenc_si128(st.s2, st.s3)
+	n4 := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1))
+	n5 := x86._mm_aesenc_si128(st.s4, st.s5)
+	n6 := x86._mm_aesenc_si128(st.s5, st.s6)
+	n7 := x86._mm_aesenc_si128(st.s6, st.s7)
+
+	st.s0, st.s1, st.s2, st.s3 = n0, n1, n2, n3
+	st.s4, st.s5, st.s6, st.s7 = n4, n5, n6, n7
+}
+
+// update_hw_256 applies one state update round for the 6-word variant,
+// absorbing m into s0.  All new words are computed from the old state
+// before any of them is written back.
+@(private = "file", enable_target_feature = "sse2,aes")
+update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) {
+	n0 := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m))
+	n1 := x86._mm_aesenc_si128(st.s0, st.s1)
+	n2 := x86._mm_aesenc_si128(st.s1, st.s2)
+	n3 := x86._mm_aesenc_si128(st.s2, st.s3)
+	n4 := x86._mm_aesenc_si128(st.s3, st.s4)
+	n5 := x86._mm_aesenc_si128(st.s4, st.s5)
+
+	st.s0, st.s1, st.s2 = n0, n1, n2
+	st.s3, st.s4, st.s5 = n3, n4, n5
+}
+
+// absorb_hw_128l absorbs the first 32 bytes of ai as two 16-byte words.
+@(private = "file", enable_target_feature = "sse2,aes")
+absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
+	lo := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
+	hi := intrinsics.unaligned_load((^x86.__m128i)(&ai[16]))
+	update_hw_128l(st, lo, hi)
+}
+
+// absorb_hw_256 absorbs the first 16 bytes of ai as a single word.
+@(private = "file", enable_target_feature = "sse2,aes")
+absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) {
+	block := intrinsics.unaligned_load((^x86.__m128i)(&ai[0]))
+	update_hw_256(st, block)
+}
+
+// absorb_hw absorbs aad into the hardware state in rate-sized blocks.
+// A trailing partial block is zero padded up to the rate first.
+@(private, enable_target_feature = "sse2,aes")
+absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check {
+	remaining := aad
+
+	// Full rate-sized blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for len(remaining) >= _RATE_128L {
+			absorb_hw_128l(st, remaining)
+			remaining = remaining[_RATE_128L:]
+		}
+	case _RATE_256:
+		for len(remaining) >= _RATE_256 {
+			absorb_hw_256(st, remaining)
+			remaining = remaining[_RATE_256:]
+		}
+	}
+
+	// Nothing left over: done.
+	if len(remaining) == 0 {
+		return
+	}
+
+	// Pad out the remainder with `0`s till it is rate sized.
+	padded: [_RATE_MAX]byte // AAD is not confidential.
+	copy(padded[:], remaining)
+	switch st.rate {
+	case _RATE_128L:
+		absorb_hw_128l(st, padded[:])
+	case _RATE_256:
+		absorb_hw_256(st, padded[:])
+	}
+}
+
+// z_hw_128l returns the two keystream words for the 8-word variant:
+//
+//   z0 = s6 ^ s1 ^ (s2 & s3)
+//   z1 = s2 ^ s5 ^ (s6 & s7)
+@(private = "file", enable_target_feature = "sse2", require_results)
+z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) {
+	and_23 := x86._mm_and_si128(st.s2, st.s3)
+	and_67 := x86._mm_and_si128(st.s6, st.s7)
+
+	z0 := x86._mm_xor_si128(st.s6, x86._mm_xor_si128(st.s1, and_23))
+	z1 := x86._mm_xor_si128(st.s2, x86._mm_xor_si128(st.s5, and_67))
+	return z0, z1
+}
+
+// z_hw_256 returns the keystream word for the 6-word variant:
+//
+//   z = s1 ^ s4 ^ s5 ^ (s2 & s3)
+@(private = "file", enable_target_feature = "sse2", require_results)
+z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i {
+	and_23 := x86._mm_and_si128(st.s2, st.s3)
+	acc := x86._mm_xor_si128(st.s5, and_23)
+	acc = x86._mm_xor_si128(st.s4, acc)
+	return x86._mm_xor_si128(st.s1, acc)
+}
+
+// enc_hw_128l encrypts one 32-byte block from xi into ci.  The keystream
+// is taken from the state before the plaintext is absorbed, and the
+// input is fully loaded before anything is stored.
+@(private = "file", enable_target_feature = "sse2,aes")
+enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
+	ks0, ks1 := z_hw_128l(st)
+
+	pt0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0]))
+	pt1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16]))
+	update_hw_128l(st, pt0, pt1)
+
+	ct0 := x86._mm_xor_si128(pt0, ks0)
+	ct1 := x86._mm_xor_si128(pt1, ks1)
+	intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), ct0)
+	intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), ct1)
+}
+
+// enc_hw_256 encrypts one 16-byte block from xi into ci.  The keystream
+// is taken from the state before the plaintext is absorbed.
+@(private = "file", enable_target_feature = "sse2,aes")
+enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check {
+	ks := z_hw_256(st)
+
+	pt := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi)))
+	update_hw_256(st, pt)
+
+	ct := x86._mm_xor_si128(pt, ks)
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ct)
+}
+
+// enc_hw encrypts src into dst with the hardware state, absorbing the
+// plaintext as it goes.  len(src) bytes are processed; a trailing
+// partial block is zero padded to the rate before being processed, and
+// only the matching prefix of the result is written to dst.
+@(private, enable_target_feature = "sse2,aes")
+enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
+	out, inp := dst, src
+
+	// Full rate-sized blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for len(inp) >= _RATE_128L {
+			enc_hw_128l(st, out, inp)
+			out, inp = out[_RATE_128L:], inp[_RATE_128L:]
+		}
+	case _RATE_256:
+		for len(inp) >= _RATE_256 {
+			enc_hw_256(st, out, inp)
+			out, inp = out[_RATE_256:], inp[_RATE_256:]
+		}
+	}
+
+	tail := len(inp)
+	if tail == 0 {
+		return
+	}
+
+	// Pad out the remainder with `0`s till it is rate sized.
+	buf: [_RATE_MAX]byte // Ciphertext is not confidential.
+	copy(buf[:], inp)
+	switch st.rate {
+	case _RATE_128L:
+		enc_hw_128l(st, buf[:], buf[:])
+	case _RATE_256:
+		enc_hw_256(st, buf[:], buf[:])
+	}
+	copy(out, buf[:tail])
+}
+
+// dec_hw_128l decrypts one 32-byte block from ci into xi and absorbs the
+// recovered plaintext.  The keystream is taken from the state before the
+// update, and the input is fully loaded before anything is stored.
+@(private = "file", enable_target_feature = "sse2,aes")
+dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
+	ks0, ks1 := z_hw_128l(st)
+
+	ct0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0]))
+	ct1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16]))
+	pt0 := x86._mm_xor_si128(ct0, ks0)
+	pt1 := x86._mm_xor_si128(ct1, ks1)
+
+	update_hw_128l(st, pt0, pt1)
+	intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), pt0)
+	intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), pt1)
+}
+
+// dec_hw_256 decrypts one 16-byte block from ci into xi and absorbs the
+// recovered plaintext.  The keystream is taken from the state before the
+// update.
+@(private = "file", enable_target_feature = "sse2,aes")
+dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check {
+	ks := z_hw_256(st)
+
+	ct := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci)))
+	pt := x86._mm_xor_si128(ct, ks)
+
+	update_hw_256(st, pt)
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), pt)
+}
+
+// dec_partial_hw_128l decrypts a trailing block shorter than the rate.
+// cn is zero padded to 32 bytes and XORed with the keystream, len(xn)
+// bytes of the result are written to xn, and the plaintext re-padded
+// with zeroes is absorbed into the state.  The scratch buffer holds
+// plaintext, so it is wiped on return.
+@(private = "file", enable_target_feature = "sse2,aes")
+dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
+	buf: [_RATE_128L]byte
+	defer mem.zero_explicit(&buf, size_of(buf))
+
+	ks0, ks1 := z_hw_128l(st)
+	copy(buf[:], cn)
+
+	ct0 := intrinsics.unaligned_load((^x86.__m128i)(&buf[0]))
+	ct1 := intrinsics.unaligned_load((^x86.__m128i)(&buf[16]))
+	pt0 := x86._mm_xor_si128(ct0, ks0)
+	pt1 := x86._mm_xor_si128(ct1, ks1)
+
+	intrinsics.unaligned_store((^x86.__m128i)(&buf[0]), pt0)
+	intrinsics.unaligned_store((^x86.__m128i)(&buf[16]), pt1)
+	copy(xn, buf[:])
+
+	// Bytes past the message end are keystream, not plaintext: clear
+	// them so that the zero padded plaintext is what gets absorbed.
+	for i := len(xn); i < _RATE_128L; i += 1 {
+		buf[i] = 0
+	}
+	pt0 = intrinsics.unaligned_load((^x86.__m128i)(&buf[0])) // v0
+	pt1 = intrinsics.unaligned_load((^x86.__m128i)(&buf[16])) // v1
+	update_hw_128l(st, pt0, pt1)
+}
+
+// dec_partial_hw_256 decrypts a trailing block shorter than the rate.
+// cn is zero padded to 16 bytes and XORed with the keystream, len(xn)
+// bytes of the result are written to xn, and the plaintext re-padded
+// with zeroes is absorbed into the state.  The scratch buffer holds
+// plaintext, so it is wiped on return.
+@(private = "file", enable_target_feature = "sse2,aes")
+dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check {
+	buf: [_RATE_256]byte
+	defer mem.zero_explicit(&buf, size_of(buf))
+
+	ks := z_hw_256(st)
+	copy(buf[:], cn)
+
+	ct := intrinsics.unaligned_load((^x86.__m128i)(&buf[0]))
+	pt := x86._mm_xor_si128(ct, ks)
+
+	intrinsics.unaligned_store((^x86.__m128i)(&buf[0]), pt)
+	copy(xn, buf[:])
+
+	// Bytes past the message end are keystream, not plaintext: clear
+	// them so that the zero padded plaintext is what gets absorbed.
+	for i := len(xn); i < _RATE_256; i += 1 {
+		buf[i] = 0
+	}
+	pt = intrinsics.unaligned_load((^x86.__m128i)(&buf[0]))
+	update_hw_256(st, pt)
+}
+
+// dec_hw decrypts src into dst with the hardware state, absorbing the
+// recovered plaintext as it goes.  len(src) bytes are processed; a
+// trailing partial block is handed to the dec_partial_hw_* routines.
+@(private, enable_target_feature = "sse2,aes")
+dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check {
+	out, inp := dst, src
+
+	// Full rate-sized blocks.
+	switch st.rate {
+	case _RATE_128L:
+		for len(inp) >= _RATE_128L {
+			dec_hw_128l(st, out, inp)
+			out, inp = out[_RATE_128L:], inp[_RATE_128L:]
+		}
+	case _RATE_256:
+		for len(inp) >= _RATE_256 {
+			dec_hw_256(st, out, inp)
+			out, inp = out[_RATE_256:], inp[_RATE_256:]
+		}
+	}
+
+	if len(inp) == 0 {
+		return
+	}
+
+	// Process the remainder.
+	switch st.rate {
+	case _RATE_128L:
+		dec_partial_hw_128l(st, out, inp)
+	case _RATE_256:
+		dec_partial_hw_256(st, out, inp)
+	}
+}
+
+// finalize_hw derives the authentication tag from the hardware state
+// and writes it to tag.
+//
+// ad_len and msg_len are byte counts.  len(tag) selects the tag layout
+// and is expected to be TAG_SIZE_128 or TAG_SIZE_256; nothing is
+// written for any other length.
+@(private, enable_target_feature = "sse2,aes")
+finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) {
+	// lens = LE64(ad_len in bits) || LE64(msg_len in bits)
+	lens: [16]byte
+	endian.unchecked_put_u64le(lens[0:], u64(ad_len) * 8)
+	endian.unchecked_put_u64le(lens[8:], u64(msg_len) * 8)
+
+	m := intrinsics.unaligned_load((^x86.__m128i)(&lens[0]))
+
+	lo, hi: x86.__m128i = ---, ---
+	switch st.rate {
+	case _RATE_128L:
+		// Mix the lengths block with s2, then run 7 update rounds
+		// feeding the result in as both message words.
+		m = x86._mm_xor_si128(st.s2, m)
+		for _ in 0 ..< 7 {
+			update_hw_128l(st, m, m)
+		}
+
+		// lo = s0 ^ s1 ^ s2 ^ s3
+		lo = x86._mm_xor_si128(st.s0, st.s1)
+		lo = x86._mm_xor_si128(lo, st.s2)
+		lo = x86._mm_xor_si128(lo, st.s3)
+
+		// hi = s4 ^ s5 ^ s6, with s7 folded in only for the 256-bit tag.
+		hi = x86._mm_xor_si128(st.s4, st.s5)
+		hi = x86._mm_xor_si128(hi, st.s6)
+		if len(tag) == TAG_SIZE_256 {
+			hi = x86._mm_xor_si128(hi, st.s7)
+		}
+	case _RATE_256:
+		// Mix the lengths block with s3, then run 7 update rounds.
+		m = x86._mm_xor_si128(st.s3, m)
+		for _ in 0 ..< 7 {
+			update_hw_256(st, m)
+		}
+
+		// lo = s0 ^ s1 ^ s2
+		lo = x86._mm_xor_si128(st.s0, st.s1)
+		lo = x86._mm_xor_si128(lo, st.s2)
+
+		// hi = s3 ^ s4 ^ s5
+		hi = x86._mm_xor_si128(st.s3, st.s4)
+		hi = x86._mm_xor_si128(hi, st.s5)
+	}
+	switch len(tag) {
+	case TAG_SIZE_128:
+		// The short tag is the XOR of both halves.
+		lo = x86._mm_xor_si128(lo, hi)
+		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), lo)
+	case TAG_SIZE_256:
+		// The long tag is the two halves concatenated.
+		intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), lo)
+		intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), hi)
+	}
+}
+
+// reset_state_hw wipes the entire hardware state.
+@(private)
+reset_state_hw :: proc "contextless" (st: ^State_HW) {
+	state_size := size_of(State_HW)
+	mem.zero_explicit(st, state_size)
+}
@@ -21,9 +21,7 @@ Context_CTR :: struct {

 // init_ctr initializes a Context_CTR with the provided key and IV.
 init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
-	if len(iv) != CTR_IV_SIZE {
-		panic("crypto/aes: invalid CTR IV size")
-	}
+	ensure(len(iv) == CTR_IV_SIZE, "crypto/aes: invalid CTR IV size")

 	init_impl(&ctx._impl, key, impl)
 	ctx._off = BLOCK_SIZE
@@ -36,16 +34,14 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTAT
 // keystream, and writes the resulting output to dst.  dst and src MUST
 // alias exactly or not at all.
 xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}

-	if bytes.alias_inexactly(dst, src) {
-		panic("crypto/aes: dst and src alias inexactly")
-	}
+	ensure(!bytes.alias_inexactly(dst, src), "crypto/aes: dst and src alias inexactly")

 	#no_bounds_check for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
@@ -82,7 +78,7 @@ xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {

 // keystream_bytes_ctr fills dst with the raw AES-CTR keystream output.
 keystream_bytes_ctr :: proc(ctx: ^Context_CTR, dst: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	dst := dst
 	#no_bounds_check for remaining := len(dst); remaining > 0; {
@@ -19,11 +19,9 @@ init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION)

 // encrypt_ecb encrypts the BLOCK_SIZE buffer src, and writes the result to dst.
 encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
-	if len(dst) != BLOCK_SIZE || len(src) != BLOCK_SIZE {
-		panic("crypto/aes: invalid buffer size(s)")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")

 	switch &impl in ctx._impl {
 	case ct64.Context:
@@ -35,11 +33,9 @@ encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {

 // decrypt_ecb decrypts the BLOCK_SIZE buffer src, and writes the result to dst.
 decrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
-	if len(dst) != BLOCK_SIZE || len(src) != BLOCK_SIZE {
-		panic("crypto/aes: invalid buffer size(s)")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")

 	switch &impl in ctx._impl {
 	case ct64.Context:
@@ -36,15 +36,11 @@ init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION)
 //
 // dst and plaintext MUST alias exactly or not at all.
 seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	gcm_validate_common_slice_sizes(tag, iv, aad, plaintext)
-	if len(dst) != len(plaintext) {
-		panic("crypto/aes: invalid destination ciphertext size")
-	}
-	if bytes.alias_inexactly(dst, plaintext) {
-		panic("crypto/aes: dst and plaintext alias inexactly")
-	}
+	ensure(len(dst) == len(plaintext), "crypto/aes: invalid destination ciphertext size")
+	ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/aes: dst and plaintext alias inexactly")

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		gcm_seal_hw(&impl, dst, tag, iv, aad, plaintext)
@@ -76,15 +72,11 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
 // dst and plaintext MUST alias exactly or not at all.
@(require_results)
 open_gcm :: proc(ctx: ^Context_GCM, dst, iv, aad, ciphertext, tag: []byte) -> bool {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	gcm_validate_common_slice_sizes(tag, iv, aad, ciphertext)
-	if len(dst) != len(ciphertext) {
-		panic("crypto/aes: invalid destination plaintext size")
-	}
-	if bytes.alias_inexactly(dst, ciphertext) {
-		panic("crypto/aes: dst and ciphertext alias inexactly")
-	}
+	ensure(len(dst) == len(ciphertext), "crypto/aes: invalid destination plaintext size")
+	ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/aes: dst and ciphertext alias inexactly")

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		return gcm_open_hw(&impl, dst, iv, aad, ciphertext, tag)
@@ -122,21 +114,13 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {

@(private = "file")
 gcm_validate_common_slice_sizes :: proc(tag, iv, aad, text: []byte) {
-	if len(tag) != GCM_TAG_SIZE {
-		panic("crypto/aes: invalid GCM tag size")
-	}
+	ensure(len(tag) == GCM_TAG_SIZE, "crypto/aes: invalid GCM tag size")

 	// The specification supports IVs in the range [1, 2^64) bits.
-	if l := len(iv); l == 0 || u64(l) >= GCM_IV_SIZE_MAX {
-		panic("crypto/aes: invalid GCM IV size")
-	}
+	ensure(len(iv) != 0 && u64(len(iv)) < GCM_IV_SIZE_MAX, "crypto/aes: invalid GCM IV size")

-	if aad_len := u64(len(aad)); aad_len > GCM_A_MAX {
-		panic("crypto/aes: oversized GCM aad")
-	}
-	if text_len := u64(len(text)); text_len > GCM_P_MAX {
-		panic("crypto/aes: oversized GCM src data")
-	}
+	ensure(u64(len(aad)) <= GCM_A_MAX, "crypto/aes: oversized GCM aad")
+	ensure(u64(len(text)) <= GCM_P_MAX, "crypto/aes: oversized GCM data")
 }

@(private = "file")
@@ -235,7 +235,7 @@ gctr_hw :: proc(
 // BUG: Sticking this in gctr_hw (like the other implementations) crashes
 // the compiler.
 //
-// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
+// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
@(private = "file", enable_target_feature = "sse4.1")
 hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
 	ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
@@ -18,7 +18,7 @@ package blake2b
 import "../_blake2"

 // DIGEST_SIZE is the BLAKE2b digest size in bytes.
-DIGEST_SIZE :: 64
+DIGEST_SIZE :: _blake2.BLAKE2B_SIZE

 // BLOCK_SIZE is the BLAKE2b block size in bytes.
 BLOCK_SIZE :: _blake2.BLAKE2B_BLOCK_SIZE
@@ -27,9 +27,11 @@ BLOCK_SIZE :: _blake2.BLAKE2B_BLOCK_SIZE
 Context :: _blake2.Blake2b_Context

 // init initializes a Context with the default BLAKE2b config.
-init :: proc(ctx: ^Context) {
+init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2b: invalid digest size")
+
 	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2B_SIZE
+	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

@@ -18,7 +18,7 @@ package blake2s
 import "../_blake2"

 // DIGEST_SIZE is the BLAKE2s digest size in bytes.
-DIGEST_SIZE :: 32
+DIGEST_SIZE :: _blake2.BLAKE2S_SIZE

 // BLOCK_SIZE is the BLAKE2s block size in bytes.
 BLOCK_SIZE :: _blake2.BLAKE2S_BLOCK_SIZE
@@ -27,9 +27,11 @@ BLOCK_SIZE :: _blake2.BLAKE2S_BLOCK_SIZE
 Context :: _blake2.Blake2s_Context

 // init initializes a Context with the default BLAKE2s config.
-init :: proc(ctx: ^Context) {
+init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
+	ensure(digest_size <= _blake2.MAX_SIZE, "crypto/blake2s: invalid digest size")
+
 	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2S_SIZE
+	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }

@@ -27,12 +27,8 @@ Context :: struct {
 // init inititializes a Context for ChaCha20 or XChaCha20 with the provided
 // key and iv.
 init :: proc(ctx: ^Context, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
-	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20: invalid (X)ChaCha20 key size")
-	}
-	if l := len(iv); l != IV_SIZE && l != XIV_SIZE {
-		panic("crypto/chacha20: invalid (X)ChaCha20 IV size")
-	}
+	ensure(len(key) == KEY_SIZE, "crypto/chacha20: invalid (X)ChaCha20 key size")
+	ensure(len(iv) == IV_SIZE || len(iv) == XIV_SIZE, "crypto/chacha20: invalid (X)ChaCha20 IV size")

 	k, n := key, iv

@@ -67,16 +63,14 @@ seek :: proc(ctx: ^Context, block_nr: u64) {
 // keystream, and writes the resulting output to dst.  Dst and src MUST
 // alias exactly or not at all.
 xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._state._is_initialized)
+	ensure(ctx._state._is_initialized)

 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}

-	if bytes.alias_inexactly(dst, src) {
-		panic("crypto/chacha20: dst and src alias inexactly")
-	}
+	ensure(!bytes.alias_inexactly(dst, src), "crypto/chacha20: dst and src alias inexactly")

 	st := &ctx._state
 	#no_bounds_check for remaining := len(src); remaining > 0; {
@@ -114,7 +108,7 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {

 // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output.
 keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx._state._is_initialized)
+	ensure(ctx._state._is_initialized)

 	dst, st := dst, &ctx._state
 	#no_bounds_check for remaining := len(dst); remaining > 0; {
@@ -29,13 +29,9 @@ _P_MAX :: 64 * 0xffffffff // 64 * (2^32-1)

@(private)
 _validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bool) {
-	if len(tag) != TAG_SIZE {
-		panic("crypto/chacha20poly1305: invalid destination tag size")
-	}
 	expected_iv_len := is_xchacha ? XIV_SIZE : IV_SIZE
-	if len(iv) != expected_iv_len {
-		panic("crypto/chacha20poly1305: invalid IV size")
-	}
+	ensure(len(tag) == TAG_SIZE, "crypto/chacha20poly1305: invalid destination tag size")
+	ensure(len(iv) == expected_iv_len, "crypto/chacha20poly1305: invalid IV size")

 	#assert(size_of(int) == 8 || size_of(int) <= 4)
 	when size_of(int) == 8 {
@@ -45,13 +41,11 @@ _validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bo
 		// A_MAX is limited by size_of(int), so there is no need to
 		// enforce it. P_MAX only needs to be checked on 64-bit targets,
 		// for reasons that should be obvious.
-		if text_len := len(text); text_len > _P_MAX {
-			panic("crypto/chacha20poly1305: oversized src data")
-		}
+		ensure(len(text) <= _P_MAX, "crypto/chacha20poly1305: oversized src data")
 	}
 }

-@(private)
+@(private, rodata)
 _PAD: [16]byte

@(private)
@@ -71,9 +65,7 @@ Context :: struct {

 // init initializes a Context with the provided key, for AEAD_CHACHA20_POLY1305.
 init :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) {
-	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20poly1305: invalid key size")
-	}
+	ensure(len(key) == KEY_SIZE, "crypto/chacha20poly1305: invalid key size")

 	copy(ctx._key[:], key)
 	ctx._impl = impl
@@ -96,11 +88,11 @@ init_xchacha :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEM
 //
 // dst and plaintext MUST alias exactly or not at all.
 seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ensure(ctx._is_initialized)
+
 	ciphertext := dst
 	_validate_common_slice_sizes(tag, iv, aad, plaintext, ctx._is_xchacha)
-	if len(ciphertext) != len(plaintext) {
-		panic("crypto/chacha20poly1305: invalid destination ciphertext size")
-	}
+	ensure(len(ciphertext) == len(plaintext), "crypto/chacha20poly1305: invalid destination ciphertext size")

 	stream_ctx: chacha20.Context = ---
 	chacha20.init(&stream_ctx, ctx._key[:],iv, ctx._impl)
@@ -151,11 +143,11 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
 // dst and plaintext MUST alias exactly or not at all.
@(require_results)
 open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	ensure(ctx._is_initialized)
+
 	plaintext := dst
 	_validate_common_slice_sizes(tag, iv, aad, ciphertext, ctx._is_xchacha)
-	if len(ciphertext) != len(plaintext) {
-		panic("crypto/chacha20poly1305: invalid destination plaintext size")
-	}
+	ensure(len(ciphertext) == len(plaintext), "crypto/chacha20poly1305: invalid destination plaintext size")

 	// Note: Unlike encrypt, this can fail early, so use defer for
 	// sanitization rather than assuming control flow reaches certain
@@ -0,0 +1,280 @@
+/*
+package deoxysii implements the Deoxys-II-256 Authenticated Encryption
+with Additional Data algorithm.
+
+- [[ https://sites.google.com/view/deoxyscipher ]]
+- [[ https://thomaspeyrin.github.io/web/assets/docs/papers/Jean-etal-JoC2021.pdf ]]
+*/
+package deoxysii
+
+import "base:intrinsics"
+import "core:bytes"
+import "core:crypto/aes"
+import "core:mem"
+import "core:simd"
+
+// KEY_SIZE is the Deoxys-II-256 key size in bytes.
+KEY_SIZE :: 32
+// IV_SIZE is the Deoxys-II-256 IV size in bytes.
+IV_SIZE :: 15 // 120-bits
+// TAG_SIZE is the Deoxys-II-256 tag size in bytes.
+TAG_SIZE :: 16
+
+@(private)
+PREFIX_AD_BLOCK :: 0b0010
+@(private)
+PREFIX_AD_FINAL :: 0b0110
+@(private)
+PREFIX_MSG_BLOCK :: 0b0000
+@(private)
+PREFIX_MSG_FINAL :: 0b0100
+@(private)
+PREFIX_TAG :: 0b0001
+@(private)
+PREFIX_SHIFT :: 4
+
+@(private)
+BC_ROUNDS :: 16
+@(private)
+BLOCK_SIZE :: aes.BLOCK_SIZE
+
+@(private = "file")
+_LFSR2_MASK :: simd.u8x16{
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+}
+@(private = "file")
+_LFSR3_MASK :: simd.u8x16{
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+}
+@(private = "file")
+_LFSR_SH1 :: _LFSR2_MASK
+@(private = "file")
+_LFSR_SH5 :: simd.u8x16{
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+}
+@(private = "file")
+_LFSR_SH7 :: simd.u8x16{
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+}
+@(private = "file", rodata)
+_RCONS := []byte {
+	0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a,
+	0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
+	0x72,
+}
+
+// Context is a keyed Deoxys-II-256 instance.
+Context :: struct {
+	_subkeys:        [BC_ROUNDS+1][16]byte, // Key-dependent part of each round's subtweakkey, precomputed by init.
+	_impl:           aes.Implementation, // Backend chosen by init; dispatched on by seal and open.
+	_is_initialized: bool, // Set by init, cleared by reset.
+}
+
+@(private)
+_validate_common_slice_sizes :: proc (ctx: ^Context, tag, iv, aad, text: []byte) {
+	ensure(len(tag) == TAG_SIZE, "crypto/deoxysii: invalid tag size")
+	ensure(len(iv) == IV_SIZE, "crypto/deoxysii: invalid IV size")
+
+	#assert(size_of(int) == 8 || size_of(int) <= 4)
+	// For the nonce-misuse resistant mode, the total size of the
+	// associated data and the total size of the message must not exceed
+	// `16 * 2^max_l * 2^max_m bytes`, thus 2^128 bytes for all variants
+	// of Deoxys-II, so aad and text are not length checked here.  The
+	// maximum number of messages that can be handled under the same key
+	// is 2^max_m, that is 2^64 for all variants of Deoxys.
+}
+
+// init initializes a Context with the provided key.  If the hardware
+// implementation is requested but unavailable, the portable one is
+// used instead.
+init :: proc(ctx: ^Context, key: []byte, impl := aes.DEFAULT_IMPLEMENTATION) {
+	ensure(len(key) == KEY_SIZE, "crypto/deoxysii: invalid key size")
+
+	selected := impl
+	if selected == .Hardware && !is_hardware_accelerated() {
+		selected = .Portable
+	}
+	ctx._impl = selected
+
+	derive_ks(ctx, key)
+
+	ctx._is_initialized = true
+}
+
+// seal encrypts the plaintext and authenticates the aad and ciphertext
+// with the provided Context and iv, storing the ciphertext in dst and
+// the authentication tag in tag.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ensure(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, plaintext)
+	ensure(len(dst) == len(plaintext), "crypto/deoxysii: invalid destination ciphertext size")
+	ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/deoxysii: dst and plaintext alias inexactly")
+
+	// Dispatch on the backend chosen by init.
+	switch ctx._impl {
+	case .Portable:
+		e_ref(ctx, dst, tag, iv, aad, plaintext)
+	case .Hardware:
+		e_hw(ctx, dst, tag, iv, aad, plaintext)
+	}
+}
+
+// open authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	ensure(ctx._is_initialized)
+
+	_validate_common_slice_sizes(ctx, tag, iv, aad, ciphertext)
+	ensure(len(dst) == len(ciphertext), "crypto/deoxysii: invalid destination plaintext size")
+	ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/deoxysii: dst and ciphertext alias inexactly")
+
+	// Dispatch on the backend chosen by init.
+	authenticated: bool
+	switch ctx._impl {
+	case .Portable:
+		authenticated = d_ref(ctx, dst, iv, aad, ciphertext, tag)
+	case .Hardware:
+		authenticated = d_hw(ctx, dst, iv, aad, ciphertext, tag)
+	}
+	if authenticated {
+		return true
+	}
+
+	// Never hand back unauthenticated output.
+	mem.zero_explicit(raw_data(dst), len(ciphertext))
+	return false
+}
+
+// reset sanitizes the Context.  The Context must be
+// re-initialized to be used again.
+reset :: proc "contextless" (ctx: ^Context) {
+	// zero_explicit takes a byte count: len() of the array would be the
+	// number of subkeys, leaving most of the key material in memory.
+	mem.zero_explicit(&ctx._subkeys, size_of(ctx._subkeys))
+	ctx._is_initialized = false
+}
+
+@(private = "file")
+derive_ks :: proc "contextless" (ctx: ^Context, key: []byte) {
+	// Derive the constant component of each subtweakkey.
+	//
+	// The key schedule is as thus:
+	//
+	//   STK_i = TK1_i ^ TK2_i ^ TK3_i ^ RC_i
+	//
+	//   TK1_i = h(TK1_(i-1))
+	//   TK2_i = h(LFSR2(TK2_(i-1)))
+	//   TK3_i = h(LFSR3(TK3_(i-1)))
+	//
+	// where:
+	//
+	//   KT = K || T
+	//   W3 = KT[:16]
+	//   W2 = KT[16:32]
+	//   W1 = KT[32:]
+	//
+	//   TK1_0 = W1
+	//   TK2_0 = W2
+	//   TK3_0 = W3
+	//
+	// As `K` is fixed per Context, the XORs of `TK3_0 .. TK3_n`,
+	// `TK2_0 .. TK2_n` and RC_i can be precomputed in advance like
+	// thus:
+	//
+	//   subkey_i = TK3_i ^ TK2_i ^ RC_i
+	//
+	// When it is time to actually call Deoxys-BC-384, it is then
+	// a simple matter of deriving each round subtweakkey via:
+	//
+	//   TK1_0 = T (Tweak)
+	//   STK_0 = subkey_0 ^ TK1_0
+	//   STK_i = subkey_i (precomputed) ^ H(TK1_(i-1))
+	//
+	// We opt to use SIMD here and for the subtweakkey derivation
+	// as `H()` is typically a single vector instruction.
+
+	w2 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:])))
+	w3 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key)))
+
+	// Precompute subkey_0 .. subkey_16.
+	for i in 0 ..= BC_ROUNDS {
+		// subkey_0 does not apply LFSR2/3 or H.
+		if i > 0 {
+			w2 = h(lfsr2(w2))
+			w3 = h(lfsr3(w3))
+		}
+
+		subkey := simd.bit_xor(w2, simd.bit_xor(w3, rcon(i)))
+		intrinsics.unaligned_store((^simd.u8x16)(&ctx._subkeys[i]), subkey)
+	}
+}
+
+@(private = "file")
+lfsr2 :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 {
+	// LFSR2 is an application of the following LFSR to each byte of input.
+	// (x7||x6||x5||x4||x3||x2||x1||x0) -> (x6||x5||x4||x3||x2||x1||x0||x7 ^ x5)
+	rotated := simd.shl(tk, _LFSR_SH1) // x6 .. x0 moved up by one, bit 0 cleared
+	x7 := simd.shr(tk, _LFSR_SH7)      // x7 in bit 0
+	x5 := simd.shr(tk, _LFSR_SH5)      // x5 in bit 0 (higher bits are masked off below)
+
+	feedback := simd.bit_and(simd.bit_xor(x7, x5), _LFSR2_MASK)
+	return simd.bit_or(rotated, feedback)
+}
+
+@(private = "file")
+lfsr3 :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 {
+	// LFSR3 is an application of the following LFSR to each byte of input.
+	// (x7||x6||x5||x4||x3||x2||x1||x0) -> (x0 ^ x6||x7||x6||x5||x4||x3||x2||x1)
+	rotated := simd.shr(tk, _LFSR_SH1) // x7 .. x1 moved down by one, bit 7 cleared
+	x0 := simd.shl(tk, _LFSR_SH7)      // x0 in bit 7
+	x6 := simd.shl(tk, _LFSR_SH1)      // x6 in bit 7 (lower bits are masked off below)
+
+	feedback := simd.bit_and(simd.bit_xor(x0, x6), _LFSR3_MASK)
+	return simd.bit_or(rotated, feedback)
+}
+
+// h applies the fixed byte permutation `H()` of the key schedule to tk:
+// output byte i is input byte idx[i] for the index list below.
+@(private)
+h :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 {
+	permuted := simd.swizzle(
+		tk,
+		0x01, 0x06, 0x0b, 0x0c, 0x05, 0x0a, 0x0f, 0x00,
+		0x09, 0x0e, 0x03, 0x04, 0x0d, 0x02, 0x07, 0x08,
+	)
+	return permuted
+}
+
+// rcon returns the round constant vector for round rd: the fixed bytes
+// 1, 2, 4, 8, then _RCONS[rd] repeated 4 times, then 8 zero bytes.
+// rd is not bounds checked.
+@(private = "file")
+rcon :: #force_inline proc "contextless" (rd: int) -> simd.u8x16 #no_bounds_check {
+	c := _RCONS[rd]
+	return simd.u8x16{1, 2, 4, 8, c, c, c, c, 0, 0, 0, 0, 0, 0, 0, 0}
+}
@@ -0,0 +1,399 @@
+package deoxysii
+
+import "base:intrinsics"
+import "core:crypto"
+import aes "core:crypto/_aes/ct64"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd"
+
+// This uses the bitsliced 64-bit general purpose register SWAR AES
+// round function.  The encryption pass skips orthogonalizing the
+// AES round function input as it is always going to be the leading 0
+// padded IV, and doing a 64-byte copy is faster.
+
+// TWEAK_SIZE is the size in bytes of one per-block tweak buffer.
+@(private = "file")
+TWEAK_SIZE :: 16
+
+// State_SW bundles the Context with the bitsliced scratch registers
+// that are reused across the block cipher invocations of a single
+// e_ref/d_ref call.
+@(private = "file")
+State_SW :: struct {
+	ctx:        ^Context,
+	q_stk, q_b: [8]u64, // q_stk: per-round subtweakkeys, q_b: block state (see bc_x4)
+}
+
+@(private = "file")
+auth_tweak :: #force_inline proc "contextless" (
+	dst: ^[TWEAK_SIZE]byte,
+	prefix: byte,
+	block_nr: int,
+) {
+	// Low 8 bytes: the shifted prefix, stored little-endian so that
+	// dst[0] = prefix << PREFIX_SHIFT.
+	head := u64(prefix) << PREFIX_SHIFT
+	endian.unchecked_put_u64le(dst[0:], head)
+
+	// High 8 bytes: the block number as a big-endian 64-bit counter.
+	endian.unchecked_put_u64be(dst[8:], u64(block_nr))
+}
+
+@(private = "file")
+enc_tweak :: #force_inline proc "contextless" (
+	dst: ^[TWEAK_SIZE]byte,
+	tag: ^[TAG_SIZE]byte,
+	block_nr: int,
+) {
+	// Tweak = (1 || tag) ^ block_nr: start from the tag with the top bit
+	// of the first byte forced on.
+	copy(dst[:], tag[:])
+	dst[0] |= 0x80
+
+	// XOR the big-endian block counter into the upper 8 bytes.
+	ctr: [8]byte
+	endian.unchecked_put_u64be(ctr[:], u64(block_nr))
+	for b, i in ctr {
+		dst[8+i] ~= b
+	}
+}
+
+@(private = "file")
+enc_plaintext :: #force_inline proc "contextless" (
+	dst: ^[8]u64,
+	iv:  []byte,
+) {
+	// Build the block `0^8 || N`: a zero byte followed by the IV.
+	block: [BLOCK_SIZE]byte = ---
+	block[0] = 0
+	copy(block[1:], iv)
+
+	// Replicate the block into all 4 lanes and bitslice it once, so the
+	// result can be copied verbatim as the cipher input for every batch.
+	lo, hi := aes.load_interleaved(block[:])
+	for lane in 0 ..< 4 {
+		dst[lane] = lo
+		dst[lane+4] = hi
+	}
+	aes.orthogonalize(dst)
+}
+
+// bc_x4 runs Deoxys-BC-384 over up to 4 blocks in parallel.
+//
+// Inputs:
+//  - ctx:    supplies the precomputed per-round subkeys (`_subkeys`).
+//  - dst:    receives `n` output blocks; its first 16 bytes are also
+//            used as scratch while the subtweakkeys are being derived.
+//  - tweaks: the per-block tweaks, only the first `n` are read.
+//  - q_stk:  scratch for the bitsliced subtweakkeys.
+//  - q_b:    the block state, already orthogonalized by the caller.
+//  - n:      number of valid blocks (1..4).
+@(private = "file")
+bc_x4 :: proc "contextless" (
+	ctx:     ^Context,
+	dst:     []byte,
+	tweaks:  ^[4][TWEAK_SIZE]byte,
+	q_stk:   ^[8]u64,
+	q_b:     ^[8]u64, // Orthogonalized
+	n:       int,
+) {
+	// Load the tweak of every active lane into a vector register.
+	tk1s: [4]simd.u8x16
+	for j in 0 ..< n {
+		tk1s[j] = intrinsics.unaligned_load((^simd.u8x16)(&tweaks[j]))
+	}
+
+	// Deoxys-BC-384
+	for i in 0 ..= BC_ROUNDS {
+		// Derive the round's subtweakkey
+		sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i]))
+		for j in 0 ..< n {
+			// The tweak word is permuted with h() before every round
+			// except the first.
+			if i != 0 {
+				tk1s[j] = h(tk1s[j])
+			}
+			// Round-trip through dst[0:16] to get the subtweakkey into
+			// the interleaved u64 representation.
+			intrinsics.unaligned_store(
+				(^simd.u8x16)(raw_data(dst)),
+				simd.bit_xor(sk, tk1s[j]),
+			)
+			q_stk[j], q_stk[j+4] = aes.load_interleaved(dst[:])
+		}
+		aes.orthogonalize(q_stk)
+
+		// Round 0 is only the subtweakkey addition, every later round
+		// is a full AES round.
+		if i != 0 {
+			aes.sub_bytes(q_b)
+			aes.shift_rows(q_b)
+			aes.mix_columns(q_b)
+		}
+		aes.add_round_key(q_b, q_stk[:])
+	}
+
+	// Leave the bitsliced representation and write out the active lanes.
+	aes.orthogonalize(q_b)
+	for i in 0 ..< n {
+		aes.store_interleaved(dst[i*BLOCK_SIZE:], q_b[i], q_b[i+4])
+	}
+}
+
+// bc_absorb encrypts every whole BLOCK_SIZE block of `src` under the
+// tweak `tweak_prefix || block number` and XORs the results into the
+// 16-byte accumulator at `dst`.  Block numbering starts at
+// `stk_block_nr`; the block number following the last processed block
+// is returned.  Trailing bytes of `src` that do not form a whole block
+// are ignored.
+@(private = "file", require_results)
+bc_absorb :: proc "contextless" (
+	st:           ^State_SW,
+	dst:          []byte,
+	src:          []byte,
+	tweak_prefix: byte,
+	stk_block_nr: int,
+) -> int {
+	tweaks: [4][TWEAK_SIZE]byte = ---
+	tmp: [BLOCK_SIZE*4]byte = ---
+
+	src, stk_block_nr := src, stk_block_nr
+	// Keep the running accumulator in a register until the end.
+	dst_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(dst)))
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks > 0 {
+		// Derive the tweak(s), orthogonalize the plaintext
+		n := min(nr_blocks, 4)
+		for i in 0 ..< n {
+			auth_tweak(&tweaks[i], tweak_prefix, stk_block_nr + i)
+			st.q_b[i], st.q_b[i + 4] = aes.load_interleaved(src)
+			src = src[BLOCK_SIZE:]
+		}
+		aes.orthogonalize(&st.q_b)
+
+		// Deoxys-BC-384
+		bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n)
+
+		// XOR in the existing Auth/tag
+		for i in 0 ..< n {
+			dst_ = simd.bit_xor(
+				dst_,
+				intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))),
+			)
+		}
+
+		stk_block_nr += n
+		nr_blocks -= n
+	}
+
+	intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), dst_)
+
+	// Wipe the scratch buffers before returning.
+	mem.zero_explicit(&tweaks, size_of(tweaks))
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return stk_block_nr
+}
+
+// bc_final encrypts the 16-byte accumulator at `dst` in place under
+// the tweak `(PREFIX_TAG << PREFIX_SHIFT) || iv`, producing the final
+// tag (`tag <- EK(0001 || 0^4 || N, tag)`).
+@(private = "file")
+bc_final :: proc "contextless" (
+	st:  ^State_SW,
+	dst: []byte,
+	iv:  []byte,
+) {
+	// Only tweaks[0] is used (n = 1); the rest stays uninitialized.
+	tweaks: [4][TWEAK_SIZE]byte = ---
+
+	tweaks[0][0] = PREFIX_TAG << PREFIX_SHIFT
+	copy(tweaks[0][1:], iv)
+
+	// The accumulator is loaded before bc_x4 starts using dst as scratch.
+	st.q_b[0], st.q_b[4] = aes.load_interleaved(dst)
+	aes.orthogonalize(&st.q_b)
+
+	bc_x4(st.ctx, dst, &tweaks, &st.q_stk, &st.q_b, 1)
+}
+
+// bc_encrypt XORs every whole BLOCK_SIZE block of `src` with the
+// keystream block `EK(1 || tag ^ block number, 0^8 || N)` and writes
+// the result to `dst`.  `q_n` is the pre-orthogonalized cipher input
+// (see enc_plaintext) and `tweak_tag` the tag the tweaks are derived
+// from.  Block numbering starts at `stk_block_nr`; the block number
+// following the last processed block is returned.  Trailing bytes of
+// `src` that do not form a whole block are ignored.
+@(private = "file", require_results)
+bc_encrypt :: proc "contextless" (
+	st:           ^State_SW,
+	dst:          []byte,
+	src:          []byte,
+	q_n:          ^[8]u64, // Orthogonalized
+	tweak_tag:    ^[TAG_SIZE]byte,
+	stk_block_nr: int,
+) -> int {
+	tweaks: [4][TWEAK_SIZE]byte = ---
+	tmp: [BLOCK_SIZE*4]byte = ---
+
+	dst, src, stk_block_nr := dst, src, stk_block_nr
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks > 0 {
+		// Derive the tweak(s)
+		n := min(nr_blocks, 4)
+		for i in 0 ..< n {
+			enc_tweak(&tweaks[i], tweak_tag, stk_block_nr + i)
+		}
+		st.q_b = q_n^ // The plaintext is always `0^8 || N`
+
+		// Deoxys-BC-384
+		bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n)
+
+		// XOR the ciphertext
+		for i in 0 ..< n {
+			intrinsics.unaligned_store(
+				(^simd.u8x16)(raw_data(dst[i*BLOCK_SIZE:])),
+				simd.bit_xor(
+					intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[i*BLOCK_SIZE:]))),
+					intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))),
+				),
+			)
+		}
+
+		dst, src = dst[n*BLOCK_SIZE:], src[n*BLOCK_SIZE:]
+		stk_block_nr += n
+		nr_blocks -= n
+	}
+
+	// Wipe the tweaks and the keystream before returning.
+	mem.zero_explicit(&tweaks, size_of(tweaks))
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return stk_block_nr
+}
+
+// e_ref is the portable (bitsliced) Deoxys-II encryption routine.
+//
+// It authenticates `aad` and `plaintext`, writes the ciphertext to
+// `dst` and the authentication tag to `tag`.  All temporaries that hold
+// plaintext or keystream material are wiped before returning.
+@(private)
+e_ref :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	st: State_SW = ---
+	st.ctx = ctx
+
+	// Algorithm 3
+	//
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth: [TAG_SIZE]byte
+	aad := aad
+	n := bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		// pad10*: the partial block followed by a single 0x80 byte.
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag <- Auth
+	// for j = 0 to l − 1 do
+	//   tag <- tag ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag <- tag ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag <- EK(0001 || 0^4 || N, tag)
+	m := plaintext
+	n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n)
+
+		// m_star holds the plaintext tail, do not leave it on the stack.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	bc_final(&st, auth[:], iv)
+
+	// Message encryption
+	// for j = 0 to l − 1 do
+	//   C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if M_∗ != nil then
+	//   C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	//
+	// return (C_1 || ... || C_l || C_∗, tag)
+	q_iv: [8]u64 = ---
+	enc_plaintext(&q_iv, iv)
+
+	m = plaintext
+	n = bc_encrypt(&st, dst, m, &q_iv, &auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		// Encrypt the partial block in a full-size temporary, then copy
+		// back only as many bytes as dst has room for.
+		copy(m_star[:], m)
+		_ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		// The bytes past `l` are raw keystream.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	copy(tag, auth[:])
+
+	mem.zero_explicit(&st.q_stk, size_of(st.q_stk))
+	mem.zero_explicit(&st.q_b, size_of(st.q_b))
+}
+
+// d_ref is the portable (bitsliced) Deoxys-II decryption routine.
+//
+// It decrypts `ciphertext` into `dst` using `tag` to derive the
+// keystream tweaks, recomputes the tag over `aad` and the recovered
+// plaintext, and returns true iff the recomputed tag matches `tag`.
+// `dst` is written before the tag is verified; this routine does not
+// clear it on failure.
+@(private, require_results)
+d_ref :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	st: State_SW = ---
+	st.ctx = ctx
+
+	// Algorithm 4
+	//
+	// Message decryption
+	// C_1 || ... || C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n
+	// for j = 0 to l − 1 do
+	//   M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if C_∗ != nil then
+	//   M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	q_iv: [8]u64 = ---
+	enc_plaintext(&q_iv, iv)
+
+	// `auth` first holds the received tag (for the keystream tweaks) and
+	// is later reused as the accumulator for the recomputed tag.
+	auth: [TAG_SIZE]byte
+	copy(auth[:], tag)
+
+	m := ciphertext
+	n := bc_encrypt(&st, dst, m, &q_iv, &auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		_ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth = 0
+	aad := aad
+	n = bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		// pad10*: the partial block followed by a single 0x80 byte.
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag0 <- Auth
+	// for j = 0 to l − 1 do
+	//   tag0 <- tag0 ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag0 <- EK(0001 || 0^4 || N, tag0)
+	// The tag is computed over the plaintext just written to dst.
+	m = dst[:len(ciphertext)]
+	n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		_ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n)
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	bc_final(&st, auth[:], iv)
+
+	// Tag verification
+	// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
+	// else return false
+	ok := crypto.compare_constant_time(auth[:], tag) == 1
+
+	mem.zero_explicit(&auth, size_of(auth))
+	mem.zero_explicit(&st.q_stk, size_of(st.q_stk))
+	mem.zero_explicit(&st.q_b, size_of(st.q_b))
+
+	return ok
+}
@@ -0,0 +1,21 @@
+#+build !amd64
+package deoxysii
+
+// ERR_HW_NOT_SUPPORTED is the panic message raised by the e_hw/d_hw
+// stubs in this file.
+@(private = "file")
+ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported"
+
+// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is supported.  This file is built for every target other than amd64
+// (`#+build !amd64`), where it always returns false.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return false
+}
+
+// e_hw is the stub for the hardware accelerated encryption routine.
+// It unconditionally panics with ERR_HW_NOT_SUPPORTED.
+@(private)
+e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}
+
+// d_hw is the stub for the hardware accelerated decryption routine.
+// It unconditionally panics with ERR_HW_NOT_SUPPORTED and never returns
+// a value.
+@(private, require_results)
+d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}
@@ -0,0 +1,434 @@
+#+build amd64
+package deoxysii
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/aes"
+import "core:mem"
+import "core:simd"
+import "core:simd/x86"
+
+// This processes a maximum of 4 blocks at a time, as that is suitable
+// for most current hardware that doesn't say "Xeon".
+
+// Tweak constants in vector form.  Each value occupies the low 64-bit
+// lane of the vector; the high lane is left zero so that the block
+// number can be inserted there (see auth_tweak / enc_tweak).
+
+// _BIT_ENC is OR-ed into the tag to form an encryption tweak.
+@(private = "file")
+_BIT_ENC :: x86.__m128i{0x80, 0}
+// Tweak prefix for full associated data blocks.
+@(private = "file")
+_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0}
+// Tweak prefix for the final, padded associated data block.
+@(private = "file")
+_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0}
+// Tweak prefix for full message blocks.
+@(private = "file")
+_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0}
+// Tweak prefix for the final, padded message block.
+@(private = "file")
+_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0}
+
+// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II
+// is supported.  On amd64 this defers to the AES package's own hardware
+// acceleration check.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return aes.is_hardware_accelerated()
+}
+
+@(private = "file", enable_target_feature = "sse4.1", require_results)
+auth_tweak :: #force_inline proc "contextless" (
+	prefix:   x86.__m128i,
+	block_nr: int,
+) -> x86.__m128i {
+	// The block number goes into the high 64-bit lane, byte swapped so
+	// that it is laid out big-endian in memory.
+	ctr_be := intrinsics.byte_swap(u64(block_nr))
+	return x86._mm_insert_epi64(prefix, i64(ctr_be), 1)
+}
+
+@(private = "file", enable_target_feature = "sse2", require_results)
+enc_tweak :: #force_inline proc "contextless" (
+	tag:      x86.__m128i,
+	block_nr: int,
+) -> x86.__m128i {
+	// Tweak = (tag | _BIT_ENC) ^ block_nr, with the byte swapped block
+	// number occupying the high 64-bit lane.
+	ctr := x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))}
+	flagged := x86._mm_or_si128(tag, _BIT_ENC)
+	return x86._mm_xor_si128(flagged, ctr)
+}
+
+@(private = "file", enable_target_feature = "ssse3", require_results)
+h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i {
+	// Thin adapter: reinterpret as 16 byte lanes, apply h(), and
+	// reinterpret the result back.
+	lanes := transmute(simd.u8x16)tk1
+	return transmute(x86.__m128i)h(lanes)
+}
+
+// bc_x4 runs Deoxys-BC-384 over 4 independent blocks `s_0..s_3`, each
+// with its own tweak `tweak_0..tweak_3`, and returns the 4 output
+// blocks.  The per-round subkeys are read from `ctx._subkeys`.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_x4 :: #force_inline proc "contextless" (
+	ctx: ^Context,
+	s_0, s_1, s_2, s_3:                 x86.__m128i,
+	tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i,
+) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check {
+	s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3
+	tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3
+
+	// Round 0: subtweakkey = tweak ^ subkey, XOR-ed into the state.
+	sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
+	stk_0 := x86._mm_xor_si128(tk1_0, sk)
+	stk_1 := x86._mm_xor_si128(tk1_1, sk)
+	stk_2 := x86._mm_xor_si128(tk1_2, sk)
+	stk_3 := x86._mm_xor_si128(tk1_3, sk)
+
+	s_0 = x86._mm_xor_si128(s_0, stk_0)
+	s_1 = x86._mm_xor_si128(s_1, stk_1)
+	s_2 = x86._mm_xor_si128(s_2, stk_2)
+	s_3 = x86._mm_xor_si128(s_3, stk_3)
+
+	// Rounds 1..BC_ROUNDS: permute the tweak word, derive the round's
+	// subtweakkey, and apply one AES round per block.
+	for i in 1 ..= BC_ROUNDS {
+		sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i]))
+
+		tk1_0 = h_(tk1_0)
+		tk1_1 = h_(tk1_1)
+		tk1_2 = h_(tk1_2)
+		tk1_3 = h_(tk1_3)
+
+		stk_0 = x86._mm_xor_si128(tk1_0, sk)
+		stk_1 = x86._mm_xor_si128(tk1_1, sk)
+		stk_2 = x86._mm_xor_si128(tk1_2, sk)
+		stk_3 = x86._mm_xor_si128(tk1_3, sk)
+
+		s_0 = x86._mm_aesenc_si128(s_0, stk_0)
+		s_1 = x86._mm_aesenc_si128(s_1, stk_1)
+		s_2 = x86._mm_aesenc_si128(s_2, stk_2)
+		s_3 = x86._mm_aesenc_si128(s_3, stk_3)
+	}
+
+	return s_0, s_1, s_2, s_3
+}
+
+// bc_x1 is the single block counterpart of bc_x4: it runs Deoxys-BC-384
+// over the block `s` with the tweak `tweak` and returns the result.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_x1 :: #force_inline proc "contextless" (
+	ctx:   ^Context,
+	s:     x86.__m128i,
+	tweak: x86.__m128i,
+) -> x86.__m128i #no_bounds_check {
+	state, tk1 := s, tweak
+
+	// Round 0: only the subtweakkey (tweak ^ subkey) is added.
+	sk_0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0]))
+	state = x86._mm_xor_si128(state, x86._mm_xor_si128(tk1, sk_0))
+
+	// Rounds 1..BC_ROUNDS: permute the tweak word, then apply one AES
+	// round keyed with the round's subtweakkey.
+	for rd in 1 ..= BC_ROUNDS {
+		tk1 = h_(tk1)
+		sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[rd]))
+		state = x86._mm_aesenc_si128(state, x86._mm_xor_si128(tk1, sk))
+	}
+
+	return state
+}
+
+// bc_absorb encrypts every whole BLOCK_SIZE block of `src` under the
+// tweak `tweak_prefix || block number` and XORs the results into `tag`.
+// Block numbering starts at `stk_block_nr`.  Returns the updated tag
+// and the block number following the last processed block.  Trailing
+// bytes of `src` that do not form a whole block are ignored.
+@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results)
+bc_absorb :: proc "contextless" (
+	ctx:          ^Context,
+	tag:          x86.__m128i,
+	src:          []byte,
+	tweak_prefix: x86.__m128i,
+	stk_block_nr: int,
+) -> (x86.__m128i, int) #no_bounds_check {
+	src, stk_block_nr, tag := src, stk_block_nr, tag
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	// Bulk path: 4 blocks per iteration.
+	for nr_blocks >= 4 {
+		d_0, d_1, d_2, d_3 := bc_x4(
+			ctx,
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+			auth_tweak(tweak_prefix, stk_block_nr),
+			auth_tweak(tweak_prefix, stk_block_nr + 1),
+			auth_tweak(tweak_prefix, stk_block_nr + 2),
+			auth_tweak(tweak_prefix, stk_block_nr + 3),
+		)
+
+		tag = x86._mm_xor_si128(tag, d_0)
+		tag = x86._mm_xor_si128(tag, d_1)
+		tag = x86._mm_xor_si128(tag, d_2)
+		tag = x86._mm_xor_si128(tag, d_3)
+
+		src = src[4*BLOCK_SIZE:]
+		stk_block_nr += 4
+		nr_blocks -= 4
+	}
+
+	// Tail path: the remaining 0..3 whole blocks, one at a time.
+	for nr_blocks > 0 {
+		d := bc_x1(
+			ctx,
+			intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			auth_tweak(tweak_prefix, stk_block_nr),
+		)
+
+		tag = x86._mm_xor_si128(tag, d)
+
+		src = src[BLOCK_SIZE:]
+		stk_block_nr += 1
+		nr_blocks -= 1
+	}
+
+	return tag, stk_block_nr
+}
+
+// bc_final encrypts the accumulated `tag` under the tweak
+// `(PREFIX_TAG << PREFIX_SHIFT) || iv` and returns the final tag
+// (`tag <- EK(0001 || 0^4 || N, tag)`).
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_final :: proc "contextless" (
+	ctx: ^Context,
+	tag: x86.__m128i,
+	iv:  []byte,
+) -> x86.__m128i {
+	// Zero-initialized, so any bytes not covered by the IV stay 0.
+	tmp: [BLOCK_SIZE]byte
+
+	tmp[0] = PREFIX_TAG << PREFIX_SHIFT
+	copy(tmp[1:], iv)
+
+	tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp))
+
+	return bc_x1(ctx, tag, tweak)
+}
+
+// bc_encrypt XORs every whole BLOCK_SIZE block of `src` with the
+// keystream block `EK(1 || tag ^ block number, 0^8 || N)` and writes
+// the result to `dst`.  `iv` is the cipher input block and `tweak_tag`
+// the tag the tweaks are derived from.  Block numbering starts at
+// `stk_block_nr`; the block number following the last processed block
+// is returned.  Trailing bytes of `src` that do not form a whole block
+// are ignored.
+@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results)
+bc_encrypt :: proc "contextless" (
+	ctx:          ^Context,
+	dst:          []byte,
+	src:          []byte,
+	iv:           x86.__m128i,
+	tweak_tag:    x86.__m128i,
+	stk_block_nr: int,
+) -> int {
+	dst, src, stk_block_nr := dst, src, stk_block_nr
+
+	nr_blocks := len(src) / BLOCK_SIZE
+	// Bulk path: 4 keystream blocks per iteration.
+	for nr_blocks >= 4 {
+		d_0, d_1, d_2, d_3 := bc_x4(
+			ctx,
+			iv, iv, iv, iv,
+			enc_tweak(tweak_tag, stk_block_nr),
+			enc_tweak(tweak_tag, stk_block_nr + 1),
+			enc_tweak(tweak_tag, stk_block_nr + 2),
+			enc_tweak(tweak_tag, stk_block_nr + 3),
+		)
+
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst)),
+			x86._mm_xor_si128(
+				d_0,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_1,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_2,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))),
+			),
+		)
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])),
+			x86._mm_xor_si128(
+				d_3,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))),
+			),
+		)
+
+		src, dst = src[4*BLOCK_SIZE:], dst[4*BLOCK_SIZE:]
+		stk_block_nr += 4
+		nr_blocks -= 4
+	}
+
+	// Tail path: the remaining 0..3 whole blocks, one at a time.
+	for nr_blocks > 0 {
+		d := bc_x1(
+			ctx,
+			iv,
+			enc_tweak(tweak_tag, stk_block_nr),
+		)
+
+		intrinsics.unaligned_store(
+			(^x86.__m128i)(raw_data(dst)),
+			x86._mm_xor_si128(
+				d,
+				intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))),
+			),
+		)
+
+		src, dst = src[BLOCK_SIZE:], dst[BLOCK_SIZE:]
+		stk_block_nr += 1
+		nr_blocks -= 1
+	}
+
+	return stk_block_nr
+}
+
+// e_hw is the hardware accelerated Deoxys-II encryption routine.
+//
+// It authenticates `aad` and `plaintext`, writes the ciphertext to
+// `dst` and the authentication tag to `tag`.  Temporaries that hold
+// plaintext or keystream material are wiped before returning.
+@(private)
+e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check {
+	// Cipher input for the encryption pass: `0^8 || N`.
+	tmp: [BLOCK_SIZE]byte
+	copy(tmp[1:], iv)
+	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+
+	// Algorithm 3
+	//
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth: x86.__m128i
+	n: int
+
+	aad := aad
+	auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
+	aad = aad[n*BLOCK_SIZE:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		// pad10*: the partial block followed by a single 0x80 byte.
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag <- Auth
+	// for j = 0 to l − 1 do
+	//   tag <- tag ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag <- tag ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag <- EK(0001 || 0^4 || N, tag)
+	m := plaintext
+	auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n)
+
+		// m_star holds the plaintext tail, do not leave it on the stack.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	auth = bc_final(ctx, auth, iv)
+
+	// Message encryption
+	// for j = 0 to l − 1 do
+	//   C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if M_∗ != nil then
+	//   C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	//
+	// return (C_1 || ... || C_l || C_∗, tag)
+	m = plaintext
+	n = bc_encrypt(ctx, dst, m, iv_, auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		// Encrypt the partial block in a full-size temporary, then copy
+		// back only as many bytes as dst has room for.
+		copy(m_star[:], m)
+		_ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		// The bytes past `l` are raw keystream.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth)
+}
+
+// d_hw is the hardware accelerated Deoxys-II decryption routine.
+//
+// It decrypts `ciphertext` into `dst` using `tag` to derive the
+// keystream tweaks, recomputes the tag over `aad` and the recovered
+// plaintext, and returns true iff the recomputed tag matches `tag`.
+// `dst` is written before the tag is verified; this routine does not
+// clear it on failure.
+@(private, require_results)
+d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	// Cipher input for the decryption pass: `0^8 || N`.
+	tmp: [BLOCK_SIZE]byte
+	copy(tmp[1:], iv)
+	iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp)))
+
+	// Algorithm 4
+	//
+	// Message decryption
+	// C_1 || ... || C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n
+	// for j = 0 to l − 1 do
+	//   M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N)
+	// end
+	// if C_∗ != nil then
+	//   M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N)
+	// end
+	auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag)))
+
+	m := ciphertext
+	n := bc_encrypt(ctx, dst, m, iv_, auth, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		_ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n)
+
+		copy(dst[n*BLOCK_SIZE:], m_star[:])
+
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+
+	// Associated data
+	// A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n
+	// Auth <- 0^n
+	// for i = 0 to la − 1 do
+	//   Auth <- Auth ^ EK(0010 || i, A_i+1)
+	// end
+	// if A_∗ != nil then
+	//   Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗))
+	// end
+	auth = x86.__m128i{0, 0}
+	aad := aad
+	auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0)
+	aad = aad[BLOCK_SIZE*n:]
+	if l := len(aad); l > 0 {
+		a_star: [BLOCK_SIZE]byte
+
+		// pad10*: the partial block followed by a single 0x80 byte.
+		copy(a_star[:], aad)
+		a_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n)
+	}
+
+	// Message authentication and tag generation
+	// M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n
+	// tag0 <- Auth
+	// for j = 0 to l − 1 do
+	//   tag0 <- tag0 ^ EK(0000 || j, M_j+1)
+	// end
+	// if M_∗ != nil then
+	//   tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗))
+	// end
+	// tag0 <- EK(0001 || 0^4 || N, tag0)
+	// The tag is computed over the plaintext just written to dst.
+	m = dst[:len(ciphertext)]
+	auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0)
+	m = m[n*BLOCK_SIZE:]
+	if l := len(m); l > 0 {
+		m_star: [BLOCK_SIZE]byte
+
+		copy(m_star[:], m)
+		m_star[l] = 0x80
+
+		auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n)
+
+		// m_star holds the recovered plaintext tail, do not leave it on
+		// the stack.
+		mem.zero_explicit(&m_star, size_of(m_star))
+	}
+	auth = bc_final(ctx, auth, iv)
+
+	// Tag verification
+	// if tag0 = tag then return (M_1 || ... || M_l || M_∗)
+	// else return false
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth)
+	ok := crypto.compare_constant_time(tmp[:], tag) == 1
+
+	mem.zero_explicit(&tmp, size_of(tmp))
+
+	return ok
+}
@@ -81,12 +81,8 @@ private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {

 // private_key_bytes sets dst to byte-encoding of priv_key.
 private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) {
-	if !priv_key._is_initialized {
-		panic("crypto/ed25519: uninitialized private key")
-	}
-	if len(dst) != PRIVATE_KEY_SIZE {
-		panic("crypto/ed25519: invalid destination size")
-	}
+	ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key")
+	ensure(len(dst) == PRIVATE_KEY_SIZE, "crypto/ed25519: invalid destination size")

 	copy(dst, priv_key._b[:])
 }
@@ -98,12 +94,8 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) {

 // sign writes the signature by priv_key over msg to sig.
 sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
-	if !priv_key._is_initialized {
-		panic("crypto/ed25519: uninitialized private key")
-	}
-	if len(sig) != SIGNATURE_SIZE {
-		panic("crypto/ed25519: invalid destination size")
-	}
+	ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key")
+	ensure(len(sig) == SIGNATURE_SIZE, "crypto/ed25519: invalid destination size")

 	// 1. Compute the hash of the private key d, H(d) = (h_0, h_1, ..., h_2b-1)
 	// using SHA-512 for Ed25519.  H(d) may be precomputed.
@@ -178,9 +170,7 @@ public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) ->

 // public_key_set_priv sets pub_key to the public component of priv_key.
 public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) {
-	if !priv_key._is_initialized {
-		panic("crypto/ed25519: uninitialized public key")
-	}
+	ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized public key")

 	src := &priv_key._pub_key
 	copy(pub_key._b[:], src._b[:])
@@ -191,21 +181,15 @@ public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) {

 // public_key_bytes sets dst to byte-encoding of pub_key.
 public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) {
-	if !pub_key._is_initialized {
-		panic("crypto/ed25519: uninitialized public key")
-	}
-	if len(dst) != PUBLIC_KEY_SIZE {
-		panic("crypto/ed25519: invalid destination size")
-	}
+	ensure(pub_key._is_initialized, "crypto/ed25519: uninitialized public key")
+	ensure(len(dst) == PUBLIC_KEY_SIZE, "crypto/ed25519: invalid destination size")

 	copy(dst, pub_key._b[:])
 }

 // public_key_equal returns true iff pub_key is equal to other.
 public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool {
-	if !pub_key._is_initialized || !other._is_initialized {
-		panic("crypto/ed25519: uninitialized public key")
-	}
+	ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key")

 	return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1
 }
@@ -56,7 +56,7 @@ init :: proc(ctx: ^Context, algorithm: hash.Algorithm, key: []byte) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	hash.update(&ctx._i_hash, data)
 }
@@ -64,13 +64,10 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the tag to dst, and calls
 // reset on the Context.
 final :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx._is_initialized)
-
 	defer (reset(ctx))

-	if len(dst) != ctx._tag_sz {
-		panic("crypto/hmac: invalid destination tag size")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == ctx._tag_sz, "crypto/hmac: invalid destination tag size")

 	hash.final(&ctx._i_hash, dst) // H((k ^ ipad) || text)

@@ -105,14 +102,14 @@ reset :: proc(ctx: ^Context) {

 // algorithm returns the Algorithm used by a Context instance.
 algorithm :: proc(ctx: ^Context) -> hash.Algorithm {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	return hash.algorithm(&ctx._i_hash)
 }

 // tag_size returns the tag size of a Context instance in bytes.
 tag_size :: proc(ctx: ^Context) -> int {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	return ctx._tag_sz
 }
@@ -36,6 +36,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) {
 // tag is valid.
 verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool {
 	derived_tag := make([]byte, len(tag), allocator)
+	defer(delete(derived_tag))

 	sum(sec_strength, derived_tag, msg, key, domain_sep)

@@ -59,8 +60,6 @@ init_256 :: proc(ctx: ^Context, key, domain_sep: []byte) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized)
-
 	shake.write((^shake.Context)(ctx), data)
 }

@@ -68,12 +67,9 @@ update :: proc(ctx: ^Context, data: []byte) {
 // on the Context.  This routine will panic if the dst length is less than
 // MIN_TAG_SIZE.
 final :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx.is_initialized)
 	defer reset(ctx)

-	if len(dst) < MIN_TAG_SIZE {
-		panic("crypto/kmac: invalid KMAC tag_size, too short")
-	}
+	ensure(len(dst) >= MIN_TAG_SIZE, "crypto/kmac: invalid KMAC tag_size, too short")

 	_sha3.final_cshake((^_sha3.Context)(ctx), dst)
 }
@@ -103,14 +99,12 @@ _init_kmac :: proc(ctx: ^Context, key, s: []byte, sec_strength: int) {
 		reset(ctx)
 	}

-	if len(key) < sec_strength / 8 {
-		panic("crypto/kmac: invalid KMAC key, too short")
-	}
+	ensure(len(key) >= sec_strength / 8, "crypto/kmac: invalid KMAC key, too short")

 	ctx_ := (^_sha3.Context)(ctx)
 	_sha3.init_cshake(ctx_, N_KMAC, s, sec_strength)
 	_sha3.bytepad(ctx_, [][]byte{key}, _sha3.rate_cshake(sec_strength))
 }

-@(private)
+@(private, rodata)
 N_KMAC := []byte{'K', 'M', 'A', 'C'}
@@ -40,37 +40,37 @@ BLOCK_SIZE_512 :: _sha3.RATE_512
 Context :: distinct _sha3.Context

 // init_224 initializes a Context for Keccak-224.
-init_224 :: proc(ctx: ^Context) {
+init_224 :: proc "contextless" (ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_224
 	_init(ctx)
 }

 // init_256 initializes a Context for Keccak-256.
-init_256 :: proc(ctx: ^Context) {
+init_256 :: proc "contextless" (ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_256
 	_init(ctx)
 }

 // init_384 initializes a Context for Keccak-384.
-init_384 :: proc(ctx: ^Context) {
+init_384 :: proc "contextless" (ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_384
 	_init(ctx)
 }

 // init_512 initializes a Context for Keccak-512.
-init_512 :: proc(ctx: ^Context) {
+init_512 :: proc "contextless" (ctx: ^Context) {
 	ctx.mdlen = DIGEST_SIZE_512
 	_init(ctx)
 }

@(private)
-_init :: proc(ctx: ^Context) {
+_init :: proc "contextless" (ctx: ^Context) {
 	ctx.dsbyte = _sha3.DS_KECCAK
 	_sha3.init((^_sha3.Context)(ctx))
 }

 // update adds more data to the Context.
-update :: proc(ctx: ^Context, data: []byte) {
+update :: proc "contextless" (ctx: ^Context, data: []byte) {
 	_sha3.update((^_sha3.Context)(ctx), data)
 }

@@ -79,17 +79,17 @@ update :: proc(ctx: ^Context, data: []byte) {
 //
 // Iff finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
-final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
+final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	_sha3.final((^_sha3.Context)(ctx), hash, finalize_clone)
 }

 // clone clones the Context other into ctx.
-clone :: proc(ctx, other: ^Context) {
+clone :: proc "contextless" (ctx, other: ^Context) {
 	_sha3.clone((^_sha3.Context)(ctx), (^_sha3.Context)(other))
 }

 // reset sanitizes the Context.  The Context must be re-initialized to
 // be used again.
-reset :: proc(ctx: ^Context) {
+reset :: proc "contextless" (ctx: ^Context) {
 	_sha3.reset((^_sha3.Context)(ctx))
 }
@@ -53,7 +53,7 @@ init :: proc(ctx: ^Context) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized)
+	ensure(ctx.is_initialized)

 	for i := 0; i < len(data); i += 1 {
 		ctx.data[ctx.datalen] = data[i]
@@ -72,11 +72,8 @@ update :: proc(ctx: ^Context, data: []byte) {
 // Iff finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
-
-	if len(hash) < DIGEST_SIZE {
-		panic("crypto/md5: invalid destination digest size")
-	}
+	ensure(ctx.is_initialized)
+	ensure(len(hash) >= DIGEST_SIZE, "crypto/md5: invalid destination digest size")

 	ctx := ctx
 	if finalize_clone {
@@ -60,7 +60,7 @@ init :: proc(ctx: ^Context) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized)
+	ensure(ctx.is_initialized)

 	for i := 0; i < len(data); i += 1 {
 		ctx.data[ctx.datalen] = data[i]
@@ -79,11 +79,8 @@ update :: proc(ctx: ^Context, data: []byte) {
 // Iff finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
-
-	if len(hash) < DIGEST_SIZE {
-		panic("crypto/sha1: invalid destination digest size")
-	}
+	ensure(ctx.is_initialized)
+	ensure(len(hash) >= DIGEST_SIZE, "crypto/sha1: invalid destination digest size")

 	ctx := ctx
 	if finalize_clone {
@@ -60,9 +60,7 @@ Context :: struct {
 // init initializes a Context with the specified key.  The key SHOULD be
 // unique and MUST be unpredictable for each invocation.
 init :: proc(ctx: ^Context, key: []byte) {
-	if len(key) != KEY_SIZE {
-		panic("crypto/poly1305: invalid key size")
-	}
+	ensure(len(key) == KEY_SIZE, "crypto/poly1305: invalid key size")

 	// r = le_bytes_to_num(key[0..15])
 	// r = clamp(r) (r &= 0xffffffc0ffffffc0ffffffc0fffffff)
@@ -85,7 +83,7 @@ init :: proc(ctx: ^Context, key: []byte) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)

 	msg := data
 	msg_len := len(data)
@@ -124,12 +122,10 @@ update :: proc(ctx: ^Context, data: []byte) {
 // final finalizes the Context, writes the tag to dst, and calls
 // reset on the Context.
 final :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx._is_initialized)
 	defer reset(ctx)

-	if len(dst) != TAG_SIZE {
-		panic("poly1305: invalid destination tag size")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == TAG_SIZE, "crypto/poly1305: invalid destination tag size")

 	// Process remaining block
 	if ctx._leftover > 0 {
@@ -16,7 +16,7 @@ ELEMENT_SIZE :: 32
 // group element.
 WIDE_ELEMENT_SIZE :: 64

-@(private)
+@(private, rodata)
 FE_NEG_ONE := field.Tight_Field_Element {
 	2251799813685228,
 	2251799813685247,
@@ -24,7 +24,7 @@ FE_NEG_ONE := field.Tight_Field_Element {
 	2251799813685247,
 	2251799813685247,
 }
-@(private)
+@(private, rodata)
 FE_INVSQRT_A_MINUS_D := field.Tight_Field_Element {
 	278908739862762,
 	821645201101625,
@@ -32,7 +32,7 @@ FE_INVSQRT_A_MINUS_D := field.Tight_Field_Element {
 	1777959178193151,
 	2118520810568447,
 }
-@(private)
+@(private, rodata)
 FE_ONE_MINUS_D_SQ := field.Tight_Field_Element {
 	1136626929484150,
 	1998550399581263,
@@ -40,7 +40,7 @@ FE_ONE_MINUS_D_SQ := field.Tight_Field_Element {
 	118527312129759,
 	45110755273534,
 }
-@(private)
+@(private, rodata)
 FE_D_MINUS_ONE_SQUARED := field.Tight_Field_Element {
 	1507062230895904,
 	1572317787530805,
@@ -48,7 +48,7 @@ FE_D_MINUS_ONE_SQUARED := field.Tight_Field_Element {
 	317374165784489,
 	1572899562415810,
 }
-@(private)
+@(private, rodata)
 FE_SQRT_AD_MINUS_ONE := field.Tight_Field_Element {
 	2241493124984347,
 	425987919032274,
@@ -76,7 +76,7 @@ ge_clear :: proc "contextless" (ge: ^Group_Element) {

 // ge_set sets `ge = a`.
 ge_set :: proc(ge, a: ^Group_Element) {
-	_ge_assert_initialized([]^Group_Element{a})
+	_ge_ensure_initialized([]^Group_Element{a})

 	grp.ge_set(&ge._p, &a._p)
 	ge._is_initialized = true
@@ -199,9 +199,7 @@ ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool {
 // ge_set_wide_bytes sets ge to the result of deriving a ristretto255
 // group element, from a wide (512-bit) byte string.
 ge_set_wide_bytes :: proc(ge: ^Group_Element, b: []byte) {
-	if len(b) != WIDE_ELEMENT_SIZE {
-		panic("crypto/ristretto255: invalid wide input size")
-	}
+	ensure(len(b) == WIDE_ELEMENT_SIZE, "crypto/ristretto255: invalid wide input size")

 	// The element derivation function on an input string b proceeds as
 	// follows:
@@ -222,10 +220,8 @@ ge_set_wide_bytes :: proc(ge: ^Group_Element, b: []byte) {

 // ge_bytes sets dst to the canonical encoding of ge.
 ge_bytes :: proc(ge: ^Group_Element, dst: []byte) {
-	_ge_assert_initialized([]^Group_Element{ge})
-	if len(dst) != ELEMENT_SIZE {
-		panic("crypto/ristretto255: invalid destination size")
-	}
+	_ge_ensure_initialized([]^Group_Element{ge})
+	ensure(len(dst) == ELEMENT_SIZE, "crypto/ristretto255: invalid destination size")

 	x0, y0, z0, t0 := &ge._p.x, &ge._p.y, &ge._p.z, &ge._p.t

@@ -306,7 +302,7 @@ ge_bytes :: proc(ge: ^Group_Element, dst: []byte) {

 // ge_add sets `ge = a + b`.
 ge_add :: proc(ge, a, b: ^Group_Element) {
-	_ge_assert_initialized([]^Group_Element{a, b})
+	_ge_ensure_initialized([]^Group_Element{a, b})

 	grp.ge_add(&ge._p, &a._p, &b._p)
 	ge._is_initialized = true
@@ -314,7 +310,7 @@ ge_add :: proc(ge, a, b: ^Group_Element) {

 // ge_double sets `ge = a + a`.
 ge_double :: proc(ge, a: ^Group_Element) {
-	_ge_assert_initialized([]^Group_Element{a})
+	_ge_ensure_initialized([]^Group_Element{a})

 	grp.ge_double(&ge._p, &a._p)
 	ge._is_initialized = true
@@ -322,7 +318,7 @@ ge_double :: proc(ge, a: ^Group_Element) {

 // ge_negate sets `ge = -a`.
 ge_negate :: proc(ge, a: ^Group_Element) {
-	_ge_assert_initialized([]^Group_Element{a})
+	_ge_ensure_initialized([]^Group_Element{a})

 	grp.ge_negate(&ge._p, &a._p)
 	ge._is_initialized = true
@@ -330,7 +326,7 @@ ge_negate :: proc(ge, a: ^Group_Element) {

 // ge_scalarmult sets `ge = A * sc`.
 ge_scalarmult :: proc(ge, A: ^Group_Element, sc: ^Scalar) {
-	_ge_assert_initialized([]^Group_Element{A})
+	_ge_ensure_initialized([]^Group_Element{A})

 	grp.ge_scalarmult(&ge._p, &A._p, sc)
 	ge._is_initialized = true
@@ -344,7 +340,7 @@ ge_scalarmult_generator :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar)

 // ge_scalarmult_vartime sets `ge = A * sc` in variable time.
 ge_scalarmult_vartime :: proc(ge, A: ^Group_Element, sc: ^Scalar) {
-	_ge_assert_initialized([]^Group_Element{A})
+	_ge_ensure_initialized([]^Group_Element{A})

 	grp.ge_scalarmult_vartime(&ge._p, &A._p, sc)
 	ge._is_initialized = true
@@ -358,7 +354,7 @@ ge_double_scalarmult_generator_vartime :: proc(
 	A: ^Group_Element,
 	b: ^Scalar,
 ) {
-	_ge_assert_initialized([]^Group_Element{A})
+	_ge_ensure_initialized([]^Group_Element{A})

 	grp.ge_double_scalarmult_basepoint_vartime(&ge._p, a, &A._p, b)
 	ge._is_initialized = true
@@ -367,7 +363,7 @@ ge_double_scalarmult_generator_vartime :: proc(
 // ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
-	_ge_assert_initialized([]^Group_Element{a})
+	_ge_ensure_initialized([]^Group_Element{a})

 	grp.ge_cond_negate(&ge._p, &a._p, ctrl)
 	ge._is_initialized = true
@@ -376,7 +372,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) {
 // ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
-	_ge_assert_initialized([]^Group_Element{ge, a})
+	_ge_ensure_initialized([]^Group_Element{ge, a})

 	grp.ge_cond_assign(&ge._p, &a._p, ctrl)
 }
@@ -384,7 +380,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) {
 // ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`.
 // Behavior for all other values of ctrl are undefined,
 ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
-	_ge_assert_initialized([]^Group_Element{a, b})
+	_ge_ensure_initialized([]^Group_Element{a, b})

 	grp.ge_cond_select(&ge._p, &a._p, &b._p, ctrl)
 	ge._is_initialized = true
@@ -393,7 +389,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) {
 // ge_equal returns 1 iff `a == b`, and 0 otherwise.
@(require_results)
 ge_equal :: proc(a, b: ^Group_Element) -> int {
-	_ge_assert_initialized([]^Group_Element{a, b})
+	_ge_ensure_initialized([]^Group_Element{a, b})

 	// CT_EQ(x1 * y2, y1 * x2) | CT_EQ(y1 * y2, x1 * x2)
 	ax_by, ay_bx, ay_by, ax_bx: field.Tight_Field_Element = ---, ---, ---, ---
@@ -501,10 +497,8 @@ ge_map :: proc "contextless" (ge: ^Group_Element, b: []byte) {
 }

@(private)
-_ge_assert_initialized :: proc(ges: []^Group_Element) {
+_ge_ensure_initialized :: proc(ges: []^Group_Element) {
 	for ge in ges {
-		if !ge._is_initialized {
-			panic("crypto/ristretto255: uninitialized group element")
-		}
+		ensure(ge._is_initialized, "crypto/ristretto255: uninitialized group element")
 	}
 }
@@ -42,9 +42,7 @@ sc_set_bytes :: proc(sc: ^Scalar, b: []byte) -> bool {
 // scalar, from a wide (512-bit) byte string by interpreting b as a
 // little-endian value, and reducing it mod the group order.
 sc_set_bytes_wide :: proc(sc: ^Scalar, b: []byte) {
-	if len(b) != WIDE_SCALAR_SIZE {
-		panic("crypto/ristretto255: invalid wide input size")
-	}
+	ensure(len(b) == WIDE_SCALAR_SIZE, "crypto/ristretto255: invalid wide input size")

 	b_ := (^[WIDE_SCALAR_SIZE]byte)(raw_data(b))
 	grp.sc_set_bytes_wide(sc, b_)
@@ -52,9 +50,7 @@ sc_set_bytes_wide :: proc(sc: ^Scalar, b: []byte) {

 // sc_bytes sets dst to the canonical encoding of sc.
 sc_bytes :: proc(sc: ^Scalar, dst: []byte) {
-	if len(dst) != SCALAR_SIZE {
-		panic("crypto/ristretto255: invalid destination size")
-	}
+	ensure(len(dst) == SCALAR_SIZE, "crypto/ristretto255: invalid destination size")

 	grp.sc_bytes(dst, sc)
 }
@@ -15,9 +15,9 @@ package sha2
        zhibog, dotbmp:  Initial implementation.
 */

-import "core:encoding/endian"
+@(require) import "core:encoding/endian"
 import "core:math/bits"
-import "core:mem"
+@(require) import "core:mem"

 // DIGEST_SIZE_224 is the SHA-224 digest size in bytes.
 DIGEST_SIZE_224 :: 28
@@ -158,7 +158,7 @@ _init :: proc(ctx: ^$T) {

 // update adds more data to the Context.
 update :: proc(ctx: ^$T, data: []byte) {
-	assert(ctx.is_initialized)
+	ensure(ctx.is_initialized)

 	when T == Context_256 {
 		CURR_BLOCK_SIZE :: BLOCK_SIZE_256
@@ -194,11 +194,8 @@ update :: proc(ctx: ^$T, data: []byte) {
 // Iff finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
-
-	if len(hash) * 8 < ctx.md_bits {
-		panic("crypto/sha2: invalid destination digest size")
-	}
+	ensure(ctx.is_initialized)
+	ensure(len(hash) * 8 >= ctx.md_bits, "crypto/sha2: invalid destination digest size")

 	ctx := ctx
 	if finalize_clone {
@@ -238,7 +235,7 @@ final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) {
 		endian.unchecked_put_u64be(pad[8:], length_lo)
 		update(ctx, pad[0:16])
 	}
-	assert(ctx.bitlength == 0)
+	assert(ctx.bitlength == 0) // Check for bugs

 	when T == Context_256 {
 		for i := 0; i < ctx.md_bits / 32; i += 1 {
@@ -270,8 +267,8 @@ reset :: proc(ctx: ^$T) {
    SHA2 implementation
 */

-@(private)
-sha256_k := [64]u32 {
+@(private, rodata)
+SHA256_K := [64]u32 {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -290,8 +287,8 @@ sha256_k := [64]u32 {
 	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 }

-@(private)
-sha512_k := [80]u64 {
+@(private, rodata)
+SHA512_K := [80]u64 {
 	0x428a2f98d728ae22, 0x7137449123ef65cd,
 	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
 	0x3956c25bf348b538, 0x59f111f1b605d019,
@@ -334,6 +331,11 @@ sha512_k := [80]u64 {
 	0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
 }

+@(private)
+SHA256_ROUNDS :: 64
+@(private)
+SHA512_ROUNDS :: 80
+
@(private)
 SHA256_CH :: #force_inline proc "contextless" (x, y, z: u32) -> u32 {
 	return (x & y) ~ (~x & z)
@@ -395,22 +397,29 @@ SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 {
 }

@(private)
-sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) {
+sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check {
 	when T == Context_256 {
-		w: [64]u32
+		if is_hardware_accelerated_256() {
+			sha256_transf_hw(ctx, data)
+			return
+		}
+
+		w: [SHA256_ROUNDS]u32
 		wv: [8]u32
 		t1, t2: u32
+
 		CURR_BLOCK_SIZE :: BLOCK_SIZE_256
 	} else when T == Context_512 {
-		w: [80]u64
+		w: [SHA512_ROUNDS]u64
 		wv: [8]u64
 		t1, t2: u64
+
 		CURR_BLOCK_SIZE :: BLOCK_SIZE_512
 	}

 	data := data
 	for len(data) >= CURR_BLOCK_SIZE {
-		for i := 0; i < 16; i += 1 {
+		for i in 0 ..< 16 {
 			when T == Context_256 {
 				w[i] = endian.unchecked_get_u32be(data[i * 4:])
 			} else when T == Context_512 {
@@ -419,22 +428,22 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) {
 		}

 		when T == Context_256 {
-			for i := 16; i < 64; i += 1 {
+			for i in 16 ..< SHA256_ROUNDS {
 				w[i] = SHA256_F4(w[i - 2]) + w[i - 7] + SHA256_F3(w[i - 15]) + w[i - 16]
 			}
 		} else when T == Context_512 {
-			for i := 16; i < 80; i += 1 {
+			for i in 16 ..< SHA512_ROUNDS {
 				w[i] = SHA512_F4(w[i - 2]) + w[i - 7] + SHA512_F3(w[i - 15]) + w[i - 16]
 			}
 		}

-		for i := 0; i < 8; i += 1 {
+		for i in 0 ..< 8 {
 			wv[i] = ctx.h[i]
 		}

 		when T == Context_256 {
-			for i := 0; i < 64; i += 1 {
-				t1 = wv[7] + SHA256_F2(wv[4]) + SHA256_CH(wv[4], wv[5], wv[6]) + sha256_k[i] + w[i]
+			for i in 0 ..< SHA256_ROUNDS {
+				t1 = wv[7] + SHA256_F2(wv[4]) + SHA256_CH(wv[4], wv[5], wv[6]) + SHA256_K[i] + w[i]
 				t2 = SHA256_F1(wv[0]) + SHA256_MAJ(wv[0], wv[1], wv[2])
 				wv[7] = wv[6]
 				wv[6] = wv[5]
@@ -446,8 +455,8 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) {
 				wv[0] = t1 + t2
 			}
 		} else when T == Context_512 {
-			for i := 0; i < 80; i += 1 {
-				t1 = wv[7] + SHA512_F2(wv[4]) + SHA512_CH(wv[4], wv[5], wv[6]) + sha512_k[i] + w[i]
+			for i in 0 ..< SHA512_ROUNDS {
+				t1 = wv[7] + SHA512_F2(wv[4]) + SHA512_CH(wv[4], wv[5], wv[6]) + SHA512_K[i] + w[i]
 				t2 = SHA512_F1(wv[0]) + SHA512_MAJ(wv[0], wv[1], wv[2])
 				wv[7] = wv[6]
 				wv[6] = wv[5]
@@ -460,7 +469,7 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) {
 			}
 		}

-		for i := 0; i < 8; i += 1 {
+		for i in 0 ..< 8 {
 			ctx.h[i] += wv[i]
 		}

@@ -0,0 +1,19 @@
+#+build !amd64
+package sha2
+
+@(private = "file")
+ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported"
+
+// is_hardware_accelerated_256 returns true iff hardware accelerated
+// SHA-224/SHA-256 is supported.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	return false
+}
+
+// sha256_transf_hw is the stub for targets without a hardware backend.
+// sha2_transf only calls it when is_hardware_accelerated_256 returns
+// true, which never happens here, so reaching it always panics.
+@(private)
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) {
+	panic_contextless(ERR_HW_NOT_SUPPORTED)
+}
@@ -0,0 +1,260 @@
+#+build amd64
+package sha2
+
+// Based on the public domain code by Jeffrey Walton, though
+// realistically, there only is one sensible way to write this
+// and Intel's whitepaper covers it.
+//
+// See: https://github.com/noloader/SHA-Intrinsics
+
+import "base:intrinsics"
+import "core:simd"
+import "core:simd/x86"
+import "core:sys/info"
+
+@(private = "file")
+MASK :: x86.__m128i{0x0405060700010203, 0x0c0d0e0f08090a0b}
+
+@(private = "file")
+K_0 :: simd.u64x2{0x71374491428a2f98, 0xe9b5dba5b5c0fbcf}
+@(private = "file")
+K_1 :: simd.u64x2{0x59f111f13956c25b, 0xab1c5ed5923f82a4}
+@(private = "file")
+K_2 :: simd.u64x2{0x12835b01d807aa98, 0x550c7dc3243185be}
+@(private = "file")
+K_3 :: simd.u64x2{0x80deb1fe72be5d74, 0xc19bf1749bdc06a7}
+@(private = "file")
+K_4 :: simd.u64x2{0xefbe4786e49b69c1, 0x240ca1cc0fc19dc6}
+@(private = "file")
+K_5 :: simd.u64x2{0x4a7484aa2de92c6f, 0x76f988da5cb0a9dc}
+@(private = "file")
+K_6 :: simd.u64x2{0xa831c66d983e5152, 0xbf597fc7b00327c8}
+@(private = "file")
+K_7 :: simd.u64x2{0xd5a79147c6e00bf3, 0x1429296706ca6351}
+@(private = "file")
+K_8 :: simd.u64x2{0x2e1b213827b70a85, 0x53380d134d2c6dfc}
+@(private = "file")
+K_9 :: simd.u64x2{0x766a0abb650a7354, 0x92722c8581c2c92e}
+@(private = "file")
+K_10 :: simd.u64x2{0xa81a664ba2bfe8a1, 0xc76c51a3c24b8b70}
+@(private = "file")
+K_11 :: simd.u64x2{0xd6990624d192e819, 0x106aa070f40e3585}
+@(private = "file")
+K_12 :: simd.u64x2{0x1e376c0819a4c116, 0x34b0bcb52748774c}
+@(private = "file")
+K_13 :: simd.u64x2{0x4ed8aa4a391c0cb3, 0x682e6ff35b9cca4f}
+@(private = "file")
+K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814}
+@(private = "file")
+K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7}
+
+
+// is_hardware_accelerated_256 returns true iff the CPU provides every
+// extension that the hardware SHA-224/SHA-256 implementation requires.
+is_hardware_accelerated_256 :: proc "contextless" () -> bool {
+	// Target features enabled on sha256_transf_hw.
+	REQUIRED :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.sha,
+	}
+
+	if detected, known := info.cpu_features.?; known {
+		return detected >= REQUIRED // superset test
+	}
+	return false
+}
+
+@(private, enable_target_feature="sse2,ssse3,sse4.1,sha")
+sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check {
+	// Load the state and permute it into the ABEF/CDGH layout used by the rounds
+	tmp := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[0]))
+	state_1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[4]))
+
+	tmp = x86._mm_shuffle_epi32(tmp, 0xb1)            // CDAB
+	state_1 = x86._mm_shuffle_epi32(state_1, 0x1b)    // EFGH
+	state_0 := x86._mm_alignr_epi8(tmp, state_1, 8)   // ABEF
+	// state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH
+	state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp)
+
+	data := data
+	for len(data) >= BLOCK_SIZE_256 {
+		state_0_save, state_1_save := state_0, state_1
+
+		// Rounds 0-3
+		msg := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data)))
+		msg_0 := x86._mm_shuffle_epi8(msg, MASK)
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_0))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 4-7
+		msg_1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[16:])))
+		msg_1 = x86._mm_shuffle_epi8(msg_1, MASK)
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_1))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 8-11
+		msg_2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[32:])))
+		msg_2 = x86._mm_shuffle_epi8(msg_2, MASK)
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_2))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0xe)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 12-15
+		msg_3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[48:])))
+		msg_3 = x86._mm_shuffle_epi8(msg_3, MASK)
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_3))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 16-19
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_4))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 20-23
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_5))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 24-27
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_6))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 28-31
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_7))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 32-35
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_8))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 36-39
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_9))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1)
+
+		// Rounds 40-43
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_10))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2)
+
+		// Rounds 44-47
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_11))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4)
+		msg_0 = x86._mm_add_epi32(msg_0, tmp)
+		msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3)
+
+		// Rounds 48-51
+		msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_12))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4)
+		msg_1 = x86._mm_add_epi32(msg_1, tmp)
+		msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+		msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0)
+
+		// Rounds 52-55
+		msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_13))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4)
+		msg_2 = x86._mm_add_epi32(msg_2, tmp)
+		msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 56-59
+		msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_14))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4)
+		msg_3 = x86._mm_add_epi32(msg_3, tmp)
+		msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		// Rounds 60-63
+		msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_15))
+		state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg)
+		msg = x86._mm_shuffle_epi32(msg, 0x0e)
+		state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg)
+
+		state_0 = x86._mm_add_epi32(state_0, state_0_save)
+		state_1 = x86._mm_add_epi32(state_1, state_1_save)
+
+		data = data[BLOCK_SIZE_256:]
+	}
+
+	// Undo the ABEF/CDGH permutation and write back the updated state
+	tmp = x86._mm_shuffle_epi32(state_0, 0x1b)        // FEBA
+	state_1 = x86._mm_shuffle_epi32(state_1, 0xb1)    // DCHG
+	// state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA
+	state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1)
+	state_1 = x86._mm_alignr_epi8(state_1, tmp, 8)    // HGFE
+
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0)
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1)
+}
+
+// Stand-in for `_mm_blend_epi16(a, b, 0xf0)`: lane 0 of a, lane 1 of b.
+@(private = "file")
+kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i {
+	// HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`.
+	lo, hi := simd.to_array(a), simd.to_array(b)
+	return x86.__m128i{lo[0], hi[1]}
+}
@@ -219,18 +219,14 @@ verify_4_8 :: proc {
 */

 init :: proc(ctx: ^Context, key: []byte, c_rounds, d_rounds: int) {
-	if len(key) != KEY_SIZE {
-		panic("crypto/siphash; invalid key size")
-	}
+	ensure(len(key) == KEY_SIZE, "crypto/siphash: invalid key size")
 	ctx.c_rounds = c_rounds
 	ctx.d_rounds = d_rounds
 	is_valid_setting :=
 		(ctx.c_rounds == 1 && ctx.d_rounds == 3) ||
 		(ctx.c_rounds == 2 && ctx.d_rounds == 4) ||
 		(ctx.c_rounds == 4 && ctx.d_rounds == 8)
-	if !is_valid_setting {
-		panic("crypto/siphash: incorrect rounds set up")
-	}
+	ensure(is_valid_setting, "crypto/siphash: incorrect rounds set up")
 	ctx.k0 = endian.unchecked_get_u64le(key[:8])
 	ctx.k1 = endian.unchecked_get_u64le(key[8:])
 	ctx.v0 = 0x736f6d6570736575 ~ ctx.k0
@@ -245,7 +241,7 @@ init :: proc(ctx: ^Context, key: []byte, c_rounds, d_rounds: int) {
 }

 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized, "crypto/siphash: context is not initialized")
+	ensure(ctx.is_initialized)

 	data := data
 	ctx.total_length += len(data)
@@ -269,7 +265,7 @@ update :: proc(ctx: ^Context, data: []byte) {
 }

 final :: proc(ctx: ^Context, dst: ^u64) {
-	assert(ctx.is_initialized, "crypto/siphash: context is not initialized")
+	ensure(ctx.is_initialized)

 	tmp: [BLOCK_SIZE]byte
 	copy(tmp[:], ctx.buf[:ctx.last_block])
@@ -336,9 +332,8 @@ _get_byte :: #force_inline proc "contextless" (byte_num: byte, into: u64) -> byt

@(private)
 _collect_output :: #force_inline proc(dst: []byte, hash: u64) {
-	if len(dst) < DIGEST_SIZE {
-		panic("crypto/siphash: invalid tag size")
-	}
+	ensure(len(dst) >= DIGEST_SIZE, "crypto/siphash: invalid tag size")
+
 	dst[0] = _get_byte(7, hash)
 	dst[1] = _get_byte(6, hash)
 	dst[2] = _get_byte(5, hash)
@@ -53,7 +53,7 @@ init :: proc(ctx: ^Context) {

 // update adds more data to the Context.
 update :: proc(ctx: ^Context, data: []byte) {
-	assert(ctx.is_initialized)
+	ensure(ctx.is_initialized)

 	data := data
 	ctx.length += u64(len(data))
@@ -83,11 +83,8 @@ update :: proc(ctx: ^Context, data: []byte) {
 // Iff finalize_clone is set, final will work on a copy of the Context,
 // which is useful for for calculating rolling digests.
 final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
-	assert(ctx.is_initialized)
-
-	if len(hash) < DIGEST_SIZE {
-		panic("crypto/sm3: invalid destination digest size")
-	}
+	ensure(ctx.is_initialized)
+	ensure(len(hash) >= DIGEST_SIZE, "crypto/sm3: invalid destination digest size")

 	ctx := ctx
 	if finalize_clone {
@@ -110,7 +107,7 @@ final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) {
 	length <<= 3
 	endian.unchecked_put_u64be(pad[:], length)
 	update(ctx, pad[0:8])
-	assert(ctx.bitlength == 0)
+	assert(ctx.bitlength == 0) // Check for bugs

 	for i := 0; i < DIGEST_SIZE / 4; i += 1 {
 		endian.unchecked_put_u32be(hash[i * 4:], ctx.state[i])
@@ -136,7 +133,7 @@ reset :: proc(ctx: ^Context) {
    SM3 implementation
 */

-@(private)
+@(private, rodata)
 IV := [8]u32 {
 	0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
 	0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e,
@@ -15,7 +15,7 @@ SCALAR_SIZE :: 32
 // POINT_SIZE is the size of a X25519 point (public key/shared secret) in bytes.
 POINT_SIZE :: 32

-@(private)
+@(private, rodata)
 _BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}

@(private)
@@ -101,15 +101,9 @@ _scalarmult :: proc "contextless" (out, scalar, point: ^[32]byte) {
 // scalarmult "multiplies" the provided scalar and point, and writes the
 // resulting point to dst.
 scalarmult :: proc(dst, scalar, point: []byte) {
-	if len(scalar) != SCALAR_SIZE {
-		panic("crypto/x25519: invalid scalar size")
-	}
-	if len(point) != POINT_SIZE {
-		panic("crypto/x25519: invalid point size")
-	}
-	if len(dst) != POINT_SIZE {
-		panic("crypto/x25519: invalid destination point size")
-	}
+	ensure(len(scalar) == SCALAR_SIZE, "crypto/x25519: invalid scalar size")
+	ensure(len(point) == POINT_SIZE, "crypto/x25519: invalid point size")
+	ensure(len(dst) == POINT_SIZE, "crypto/x25519: invalid destination point size")

 	// "clamp" the scalar
 	e: [32]byte = ---
@@ -0,0 +1,155 @@
+/*
+package x448 implements the X448 (aka curve448) Elliptic-Curve
+Diffie-Hellman key exchange protocol.
+
+See:
+- [[ https://www.rfc-editor.org/rfc/rfc7748 ]]
+*/
+package x448
+
+import field "core:crypto/_fiat/field_curve448"
+import "core:mem"
+
+// SCALAR_SIZE is the size of a X448 scalar (private key) in bytes.
+SCALAR_SIZE :: 56
+// POINT_SIZE is the size of a X448 point (public key/shared secret) in bytes.
+POINT_SIZE :: 56
+
+@(private, rodata)
+_BASE_POINT: [56]byte = {
+	5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+}
+
+@(private)
+_scalar_bit :: #force_inline proc "contextless" (s: ^[56]byte, i: int) -> u8 {
+	if i >= 0 { // byte i>>3, bit i&7; negative indices read as 0
+		return 1 & (s[i >> 3] >> uint(i & 7))
+	}
+	return 0
+}
+
+@(private)
+_scalarmult :: proc "contextless" (out, scalar, point: ^[56]byte) {
+	// Montgomery pseudo-multiplication, using the RFC 7748 formula.
+	t1, t2: field.Loose_Field_Element = ---, ---
+
+	// x_1 = u
+	// x_2 = 1
+	// z_2 = 0
+	// x_3 = u
+	// z_3 = 1
+	x1: field.Tight_Field_Element = ---
+	field.fe_from_bytes(&x1, point)
+
+	x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, ---
+	field.fe_one(&x2)
+	field.fe_zero(&z2)
+	field.fe_set(&x3, &x1)
+	field.fe_one(&z3)
+
+	// swap = 0
+	swap: int
+
+	// For t = bits-1 down to 0:
+	for t := 448 - 1; t >= 0; t -= 1 {
+		// k_t = (k >> t) & 1
+		k_t := int(_scalar_bit(scalar, t))
+		// swap ^= k_t
+		swap ~= k_t
+		// Conditional swap; see text below.
+		// (x_2, x_3) = cswap(swap, x_2, x_3)
+		field.fe_cond_swap(&x2, &x3, swap)
+		// (z_2, z_3) = cswap(swap, z_2, z_3)
+		field.fe_cond_swap(&z2, &z3, swap)
+		// swap = k_t
+		swap = k_t
+
+		// Note: This deliberately omits reductions after add/sub operations
+		// if the result is only ever used as the input to a mul/square since
+		// the implementations of those can deal with non-reduced inputs.
+		//
+		// fe_tighten_cast is only used to store a fully reduced
+		// output in a Loose_Field_Element, or to provide such a
+		// Loose_Field_Element as a Tight_Field_Element argument.
+
+		// A = x_2 + z_2
+		field.fe_add(&t1, &x2, &z2)
+		// B = x_2 - z_2
+		field.fe_sub(&t2, &x2, &z2)
+		// D = x_3 - z_3
+		field.fe_sub(field.fe_relax_cast(&z2), &x3, &z3) // (z2 unreduced)
+		// DA = D * A
+		field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1)
+		// C = x_3 + z_3
+		field.fe_add(field.fe_relax_cast(&z3), &x3, &z3) // (z3 unreduced)
+		// CB = C * B
+		field.fe_carry_mul(&x3, &t2, field.fe_relax_cast(&z3))
+		// z_3 = x_1 * (DA - CB)^2
+		field.fe_sub(field.fe_relax_cast(&z3), &x2, &x3) // (z3 unreduced)
+		field.fe_carry_square(&z3, field.fe_relax_cast(&z3))
+		field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z3))
+		// x_3 = (DA + CB)^2
+		field.fe_add(field.fe_relax_cast(&z2), &x2, &x3) // (z2 unreduced)
+		field.fe_carry_square(&x3, field.fe_relax_cast(&z2))
+
+		// AA = A^2
+		field.fe_carry_square(&z2, &t1)
+		// BB = B^2
+		field.fe_carry_square(field.fe_tighten_cast(&t1), &t2) // (t1 reduced)
+		// x_2 = AA * BB
+		field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1)
+		// E = AA - BB
+		field.fe_sub(&t2, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced)
+		// z_2 = E * (AA + a24 * E)
+		field.fe_carry_mul_small(field.fe_tighten_cast(&t1), &t2, 39081) // (t1 reduced)
+		field.fe_add(&t1, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced)
+		field.fe_carry_mul(&z2, &t2, &t1)
+	}
+
+	// Conditional swap; see text below.
+	// (x_2, x_3) = cswap(swap, x_2, x_3)
+	field.fe_cond_swap(&x2, &x3, swap)
+	// (z_2, z_3) = cswap(swap, z_2, z_3)
+	field.fe_cond_swap(&z2, &z3, swap)
+
+	// Return x_2 * (z_2^(p - 2))
+	field.fe_carry_inv(&z2, field.fe_relax_cast(&z2))
+	field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
+	field.fe_to_bytes(out, &x2)
+
+	field.fe_clear_vec([]^field.Tight_Field_Element{&x1, &x2, &x3, &z2, &z3})
+	field.fe_clear_vec([]^field.Loose_Field_Element{&t1, &t2})
+}
+
+// scalarmult computes the X448 function on the given scalar and point
+// (u-coordinate), and stores the resulting point in dst.
+scalarmult :: proc(dst, scalar, point: []byte) {
+	ensure(len(scalar) == SCALAR_SIZE, "crypto/x448: invalid scalar size")
+	ensure(len(point) == POINT_SIZE, "crypto/x448: invalid point size")
+	ensure(len(dst) == POINT_SIZE, "crypto/x448: invalid destination point size")
+
+	// Work on fixed-size local copies; the secret ones are wiped on exit.
+	clamped, u, result: [56]byte = ---, ---, ---
+	defer mem.zero_explicit(&result, size_of(result))
+	defer mem.zero_explicit(&clamped, size_of(clamped))
+
+	copy_slice(u[:], point)
+	copy_slice(clamped[:], scalar)
+
+	// "clamp" the scalar: clear the two low bits, set the top bit.
+	clamped[0] &= 0xfc
+	clamped[55] |= 0x80
+
+	// result = clamped * u
+	_scalarmult(&result, &clamped, &u)
+	copy_slice(dst, result[:])
+}
+
+// scalarmult_basepoint "multiplies" the provided scalar with the X448 base
+// point (u = 5) and writes the resulting POINT_SIZE byte point to dst.
+scalarmult_basepoint :: proc(dst, scalar: []byte) {
+	scalarmult(dst, scalar, _BASE_POINT[:])
+}
@@ -49,7 +49,9 @@ _resolve :: proc(ctx: ^Context, frame: Frame, allocator: runtime.Allocator) -> (

 	data: [size_of(win32.SYMBOL_INFOW) + size_of([256]win32.WCHAR)]byte
 	symbol := (^win32.SYMBOL_INFOW)(&data[0])
-	symbol.SizeOfStruct = size_of(symbol)
+	// The value of SizeOfStruct must be the size of the whole struct,
+	// not just the size of the pointer
+	symbol.SizeOfStruct = size_of(symbol^)
 	symbol.MaxNameLen = 255
 	if win32.SymFromAddrW(ctx.impl.hProcess, win32.DWORD64(frame), &{}, symbol) {
 		fl.procedure, _ = win32.wstring_to_utf8(&symbol.Name[0], -1, allocator)
@@ -888,6 +888,34 @@ make_aligned :: proc(
 	return runtime.make_aligned(T, len, alignment, allocator, loc)
 }

+
+/*
+Allocate a new slice with alignment for allocators that might not support the
+specified alignment requirement.
+
+This procedure allocates a new slice of type `T` with length `len`, aligned
+on a boundary specified by `alignment` from an allocator specified by
+`allocator`, and returns the allocated slice.
+
+The user should `delete` the returned `original_data` slice, not the typed `slice`.
+*/
+@(require_results)
+make_over_aligned :: proc(
+	$T: typeid/[]$E,
+	#any_int len: int,
+	alignment: int,
+	allocator: runtime.Allocator,
+	loc := #caller_location,
+) -> (slice: T, original_data: []byte, err: Allocator_Error) {
+	size := size_of(E)*len + alignment-1
+	original_data, err = runtime.make([]byte, size, allocator, loc)
+	if err == nil {
+		ptr := align_forward(raw_data(original_data), uintptr(alignment))
+		slice = ([^]E)(ptr)[:len]
+	}
+	return
+}
+
 /*
 Allocate a new slice.

@@ -171,16 +171,15 @@ If the return value is:

 The comparison is performed as follows:
 1. Each byte, upto `min(len(a), len(b))` bytes is compared between `a` and `b`.
-  - If the byte in slice `a` is smaller than a byte in slice `b`, then comparison
-  stops and this procedure returns `-1`.
-  - If the byte in slice `a` is bigger than a byte in slice `b`, then comparison
-  stops and this procedure returns `+1`.
-  - Otherwise the comparison continues until `min(len(a), len(b))` are compared.
-2. If all the bytes in the range are equal, then the lengths of the slices are
-  compared.
-  - If the length of slice `a` is smaller than the length of slice `b`, then `-1` is returned.
-  - If the length of slice `b` is smaller than the length of slice `b`, then `+1` is returned.
-  - Otherwise `0` is returned.
+	- If the byte in slice `a` is smaller than a byte in slice `b`, then comparison
+	  stops and this procedure returns `-1`.
+	- If the byte in slice `a` is bigger than a byte in slice `b`, then comparison
+	  stops and this procedure returns `+1`.
+	- Otherwise the comparison continues until `min(len(a), len(b))` are compared.
+2. If all the bytes in the range are equal, then the lengths of the slices are compared.
+	- If the length of slice `a` is smaller than the length of slice `b`, then `-1` is returned.
+	- If the length of slice `b` is smaller than the length of slice `a`, then `+1` is returned.
+	- Otherwise `0` is returned.
 */
@(require_results)
 compare :: proc "contextless" (a, b: []byte) -> int {
@@ -207,11 +206,11 @@ If the return value is:

 The comparison is performed as follows:
 1. Each byte, upto `n` bytes is compared between `a` and `b`.
-  - If the byte in `a` is smaller than a byte in `b`, then comparison stops
-  and this procedure returns `-1`.
-  - If the byte in `a` is bigger than a byte in `b`, then comparison stops
-  and this procedure returns `+1`.
-  - Otherwise the comparison continues until `n` bytes are compared.
+	- If the byte in `a` is smaller than a byte in `b`, then comparison stops
+	  and this procedure returns `-1`.
+	- If the byte in `a` is bigger than a byte in `b`, then comparison stops
+	  and this procedure returns `+1`.
+	- Otherwise the comparison continues until `n` bytes are compared.
 2. If all the bytes in the range are equal, this procedure returns `0`.
 */
@(require_results)
@@ -233,11 +232,11 @@ If the return value is:

 The comparison is performed as follows:
 1. Each byte, upto `n` bytes is compared between `a` and `b`.
-  - If the byte in `a` is smaller than a byte in `b`, then comparison stops
-  and this procedure returns `-1`.
-  - If the byte in `a` is bigger than a byte in `b`, then comparison stops
-  and this procedure returns `+1`.
-  - Otherwise the comparison continues until `n` bytes are compared.
+	- If the byte in `a` is smaller than a byte in `b`, then comparison stops
+	  and this procedure returns `-1`.
+	- If the byte in `a` is bigger than a byte in `b`, then comparison stops
+	  and this procedure returns `+1`.
+	- Otherwise the comparison continues until `n` bytes are compared.
 2. If all the bytes in the range are equal, this procedure returns `0`.
 */
@(require_results)
@@ -21,188 +21,191 @@ package net
 */

 import "core:c"
-import "core:os"
+import "core:sys/posix"
+
+@(private)
+ESHUTDOWN :: 58

 Create_Socket_Error :: enum c.int {
 	None                                 = 0,
-	Family_Not_Supported_For_This_Socket = c.int(os.EAFNOSUPPORT),
-	No_Socket_Descriptors_Available      = c.int(os.EMFILE),
-	No_Buffer_Space_Available            = c.int(os.ENOBUFS),
-	No_Memory_Available_Available        = c.int(os.ENOMEM),
-	Protocol_Unsupported_By_System       = c.int(os.EPROTONOSUPPORT),
-	Wrong_Protocol_For_Socket            = c.int(os.EPROTONOSUPPORT),
-	Family_And_Socket_Type_Mismatch      = c.int(os.EPROTONOSUPPORT),
+	Family_Not_Supported_For_This_Socket = c.int(posix.EAFNOSUPPORT),
+	No_Socket_Descriptors_Available      = c.int(posix.EMFILE),
+	No_Buffer_Space_Available            = c.int(posix.ENOBUFS),
+	No_Memory_Available                  = c.int(posix.ENOMEM),
+	Protocol_Unsupported_By_System       = c.int(posix.EPROTONOSUPPORT),
+	Wrong_Protocol_For_Socket            = c.int(posix.EPROTONOSUPPORT),
+	Family_And_Socket_Type_Mismatch      = c.int(posix.EPROTONOSUPPORT),
 }

 Dial_Error :: enum c.int {
 	None                      = 0,
 	Port_Required             = -1, // Attempted to dial an endpointing without a port being set.

-	Address_In_Use            = c.int(os.EADDRINUSE),
-	In_Progress               = c.int(os.EINPROGRESS),
-	Cannot_Use_Any_Address    = c.int(os.EADDRNOTAVAIL),
-	Wrong_Family_For_Socket   = c.int(os.EAFNOSUPPORT),
-	Refused                   = c.int(os.ECONNREFUSED),
-	Is_Listening_Socket       = c.int(os.EACCES),
-	Already_Connected         = c.int(os.EISCONN),
-	Network_Unreachable       = c.int(os.ENETUNREACH),  // Device is offline
-	Host_Unreachable          = c.int(os.EHOSTUNREACH), // Remote host cannot be reached
-	No_Buffer_Space_Available = c.int(os.ENOBUFS),
-	Not_Socket                = c.int(os.ENOTSOCK),
-	Timeout                   = c.int(os.ETIMEDOUT),
+	Address_In_Use            = c.int(posix.EADDRINUSE),
+	In_Progress               = c.int(posix.EINPROGRESS),
+	Cannot_Use_Any_Address    = c.int(posix.EADDRNOTAVAIL),
+	Wrong_Family_For_Socket   = c.int(posix.EAFNOSUPPORT),
+	Refused                   = c.int(posix.ECONNREFUSED),
+	Is_Listening_Socket       = c.int(posix.EACCES),
+	Already_Connected         = c.int(posix.EISCONN),
+	Network_Unreachable       = c.int(posix.ENETUNREACH),  // Device is offline
+	Host_Unreachable          = c.int(posix.EHOSTUNREACH), // Remote host cannot be reached
+	No_Buffer_Space_Available = c.int(posix.ENOBUFS),
+	Not_Socket                = c.int(posix.ENOTSOCK),
+	Timeout                   = c.int(posix.ETIMEDOUT),

 	// TODO: we may need special handling for this; maybe make a socket a struct with metadata?
-	Would_Block               = c.int(os.EWOULDBLOCK), 
+	Would_Block               = c.int(posix.EWOULDBLOCK), 
 }

 Bind_Error :: enum c.int {
 	None                         = 0,
 	Privileged_Port_Without_Root = -1, // Attempted to bind to a port less than 1024 without root access.

-	Address_In_Use          = c.int(os.EADDRINUSE),    // Another application is currently bound to this endpoint.
-	Given_Nonlocal_Address  = c.int(os.EADDRNOTAVAIL), // The address is not a local address on this machine.
-	Broadcast_Disabled      = c.int(os.EACCES),        // To bind a UDP socket to the broadcast address, the appropriate socket option must be set.
-	Address_Family_Mismatch = c.int(os.EFAULT),        // The address family of the address does not match that of the socket.
-	Already_Bound           = c.int(os.EINVAL),        // The socket is already bound to an address.
-	No_Ports_Available      = c.int(os.ENOBUFS),       // There are not enough ephemeral ports available.
+	Address_In_Use          = c.int(posix.EADDRINUSE),    // Another application is currently bound to this endpoint.
+	Given_Nonlocal_Address  = c.int(posix.EADDRNOTAVAIL), // The address is not a local address on this machine.
+	Broadcast_Disabled      = c.int(posix.EACCES),        // To bind a UDP socket to the broadcast address, the appropriate socket option must be set.
+	Address_Family_Mismatch = c.int(posix.EFAULT),        // The address family of the address does not match that of the socket.
+	Already_Bound           = c.int(posix.EINVAL),        // The socket is already bound to an address.
+	No_Ports_Available      = c.int(posix.ENOBUFS),       // There are not enough ephemeral ports available.
 }

 Listen_Error :: enum c.int {
 	None                                    = 0,
-	Address_In_Use                          = c.int(os.EADDRINUSE),
-	Already_Connected                       = c.int(os.EISCONN),
-	No_Socket_Descriptors_Available         = c.int(os.EMFILE),
-	No_Buffer_Space_Available               = c.int(os.ENOBUFS),
-	Nonlocal_Address                        = c.int(os.EADDRNOTAVAIL),
-	Not_Socket                              = c.int(os.ENOTSOCK),
-	Listening_Not_Supported_For_This_Socket = c.int(os.EOPNOTSUPP),
+	Address_In_Use                          = c.int(posix.EADDRINUSE),
+	Already_Connected                       = c.int(posix.EISCONN),
+	No_Socket_Descriptors_Available         = c.int(posix.EMFILE),
+	No_Buffer_Space_Available               = c.int(posix.ENOBUFS),
+	Nonlocal_Address                        = c.int(posix.EADDRNOTAVAIL),
+	Not_Socket                              = c.int(posix.ENOTSOCK),
+	Listening_Not_Supported_For_This_Socket = c.int(posix.EOPNOTSUPP),
 }

 Accept_Error :: enum c.int {
 	None                                              = 0,
 	// TODO(tetra): Is this error actually possible here? Or is like Linux, in which case we can remove it.
-	Reset                                             = c.int(os.ECONNRESET), 
-	Not_Listening                                     = c.int(os.EINVAL),
-	No_Socket_Descriptors_Available_For_Client_Socket = c.int(os.EMFILE),
-	No_Buffer_Space_Available                         = c.int(os.ENOBUFS),
-	Not_Socket                                        = c.int(os.ENOTSOCK),
-	Not_Connection_Oriented_Socket                    = c.int(os.EOPNOTSUPP),
+	Reset                                             = c.int(posix.ECONNRESET), 
+	Not_Listening                                     = c.int(posix.EINVAL),
+	No_Socket_Descriptors_Available_For_Client_Socket = c.int(posix.EMFILE),
+	No_Buffer_Space_Available                         = c.int(posix.ENOBUFS),
+	Not_Socket                                        = c.int(posix.ENOTSOCK),
+	Not_Connection_Oriented_Socket                    = c.int(posix.EOPNOTSUPP),

 	// TODO: we may need special handling for this; maybe make a socket a struct with metadata?
-	Would_Block                                       = c.int(os.EWOULDBLOCK), 
+	Would_Block                                       = c.int(posix.EWOULDBLOCK), 
 }

 TCP_Recv_Error :: enum c.int {
 	None              = 0,
-	Shutdown          = c.int(os.ESHUTDOWN),
-	Not_Connected     = c.int(os.ENOTCONN),
+	Shutdown          = ESHUTDOWN,
+	Not_Connected     = c.int(posix.ENOTCONN),

 	// TODO(tetra): Is this error actually possible here?
-	Connection_Broken = c.int(os.ENETRESET),
-	Not_Socket        = c.int(os.ENOTSOCK),
-	Aborted           = c.int(os.ECONNABORTED),
+	Connection_Broken = c.int(posix.ENETRESET),
+	Not_Socket        = c.int(posix.ENOTSOCK),
+	Aborted           = c.int(posix.ECONNABORTED),

 	// TODO(tetra): Determine when this is different from the syscall returning n=0 and maybe normalize them?
-	Connection_Closed = c.int(os.ECONNRESET),
-	Offline           = c.int(os.ENETDOWN),
-	Host_Unreachable  = c.int(os.EHOSTUNREACH),
-	Interrupted       = c.int(os.EINTR),
+	Connection_Closed = c.int(posix.ECONNRESET),
+	Offline           = c.int(posix.ENETDOWN),
+	Host_Unreachable  = c.int(posix.EHOSTUNREACH),
+	Interrupted       = c.int(posix.EINTR),

 	// NOTE: No, really. Presumably this means something different for nonblocking sockets...
-	Timeout           = c.int(os.EWOULDBLOCK),
+	Timeout           = c.int(posix.EWOULDBLOCK),
 }

 UDP_Recv_Error :: enum c.int {
 	None             = 0,
-	Buffer_Too_Small = c.int(os.EMSGSIZE), // The buffer is too small to fit the entire message, and the message was truncated. When this happens, the rest of message is lost.
-	Not_Socket       = c.int(os.ENOTSOCK), // The so-called socket is not an open socket.
-	Not_Descriptor   = c.int(os.EBADF),    // The so-called socket is, in fact, not even a valid descriptor.
-	Bad_Buffer       = c.int(os.EFAULT),   // The buffer did not point to a valid location in memory.
-	Interrupted      = c.int(os.EINTR),    // A signal occurred before any data was transmitted. See signal(7).
+	Buffer_Too_Small = c.int(posix.EMSGSIZE), // The buffer is too small to fit the entire message, and the message was truncated. When this happens, the rest of the message is lost.
+	Not_Socket       = c.int(posix.ENOTSOCK), // The so-called socket is not an open socket.
+	Not_Descriptor   = c.int(posix.EBADF),    // The so-called socket is, in fact, not even a valid descriptor.
+	Bad_Buffer       = c.int(posix.EFAULT),   // The buffer did not point to a valid location in memory.
+	Interrupted      = c.int(posix.EINTR),    // A signal occurred before any data was transmitted. See signal(7).

 	// The send timeout duration passed before all data was sent. See Socket_Option.Send_Timeout.
 	// NOTE: No, really. Presumably this means something different for nonblocking sockets...
-	Timeout          = c.int(os.EWOULDBLOCK), 
-	Socket_Not_Bound = c.int(os.EINVAL), // The socket must be bound for this operation, but isn't.
+	Timeout          = c.int(posix.EWOULDBLOCK), 
+	Socket_Not_Bound = c.int(posix.EINVAL), // The socket must be bound for this operation, but isn't.
 }

 TCP_Send_Error :: enum c.int {
 	None                      = 0,

-	Aborted                   = c.int(os.ECONNABORTED), 
-	Connection_Closed         = c.int(os.ECONNRESET),
-	Not_Connected             = c.int(os.ENOTCONN),
-	Shutdown                  = c.int(os.ESHUTDOWN),
+	Aborted                   = c.int(posix.ECONNABORTED), 
+	Connection_Closed         = c.int(posix.ECONNRESET),
+	Not_Connected             = c.int(posix.ENOTCONN),
+	Shutdown                  = ESHUTDOWN,

 	// The send queue was full.
 	// This is usually a transient issue.
 	//
 	// This also shouldn't normally happen on Linux, as data is dropped if it
 	// doesn't fit in the send queue.
-	No_Buffer_Space_Available = c.int(os.ENOBUFS),
-	Offline                   = c.int(os.ENETDOWN),
-	Host_Unreachable          = c.int(os.EHOSTUNREACH),
-	Interrupted               = c.int(os.EINTR), // A signal occurred before any data was transmitted. See signal(7).
+	No_Buffer_Space_Available = c.int(posix.ENOBUFS),
+	Offline                   = c.int(posix.ENETDOWN),
+	Host_Unreachable          = c.int(posix.EHOSTUNREACH),
+	Interrupted               = c.int(posix.EINTR), // A signal occurred before any data was transmitted. See signal(7).

 	// NOTE: No, really. Presumably this means something different for nonblocking sockets...
 	// The send timeout duration passed before all data was sent. See Socket_Option.Send_Timeout.
-	Timeout                   = c.int(os.EWOULDBLOCK), 
-	Not_Socket                = c.int(os.ENOTSOCK), // The so-called socket is not an open socket.
+	Timeout                   = c.int(posix.EWOULDBLOCK), 
+	Not_Socket                = c.int(posix.ENOTSOCK), // The so-called socket is not an open socket.
 }

 // TODO
 UDP_Send_Error :: enum c.int {
 	None                        = 0,
-	Message_Too_Long            = c.int(os.EMSGSIZE), // The message is larger than the maximum UDP packet size. No data was sent.
+	Message_Too_Long            = c.int(posix.EMSGSIZE), // The message is larger than the maximum UDP packet size. No data was sent.

 	// TODO: not sure what the exact circumstances for this is yet
-	Network_Unreachable         = c.int(os.ENETUNREACH),
-	No_Outbound_Ports_Available = c.int(os.EAGAIN),   // There are no more emphemeral outbound ports available to bind the socket to, in order to send.
+	Network_Unreachable         = c.int(posix.ENETUNREACH),
+	No_Outbound_Ports_Available = c.int(posix.EAGAIN),   // There are no more ephemeral outbound ports available to bind the socket to, in order to send.

 	// The send timeout duration passed before all data was sent. See Socket_Option.Send_Timeout.
 	// NOTE: No, really. Presumably this means something different for nonblocking sockets...
-	Timeout                     = c.int(os.EWOULDBLOCK), 
-	Not_Socket                  = c.int(os.ENOTSOCK), // The so-called socket is not an open socket.
-	Not_Descriptor              = c.int(os.EBADF),    // The so-called socket is, in fact, not even a valid descriptor.
-	Bad_Buffer                  = c.int(os.EFAULT),   // The buffer did not point to a valid location in memory.
-	Interrupted                 = c.int(os.EINTR),    // A signal occurred before any data was transmitted. See signal(7).
+	Timeout                     = c.int(posix.EWOULDBLOCK), 
+	Not_Socket                  = c.int(posix.ENOTSOCK), // The so-called socket is not an open socket.
+	Not_Descriptor              = c.int(posix.EBADF),    // The so-called socket is, in fact, not even a valid descriptor.
+	Bad_Buffer                  = c.int(posix.EFAULT),   // The buffer did not point to a valid location in memory.
+	Interrupted                 = c.int(posix.EINTR),    // A signal occurred before any data was transmitted. See signal(7).

 	// The send queue was full.
 	// This is usually a transient issue.
 	//
 	// This also shouldn't normally happen on Linux, as data is dropped if it
 	// doesn't fit in the send queue.
-	No_Buffer_Space_Available   = c.int(os.ENOBUFS),
-	No_Memory_Available         = c.int(os.ENOMEM),   // No memory was available to properly manage the send queue.
+	No_Buffer_Space_Available   = c.int(posix.ENOBUFS),
+	No_Memory_Available         = c.int(posix.ENOMEM),   // No memory was available to properly manage the send queue.
 }

 Shutdown_Manner :: enum c.int {
-	Receive = c.int(os.SHUT_RD),
-	Send    = c.int(os.SHUT_WR),
-	Both    = c.int(os.SHUT_RDWR),
+	Receive = c.int(posix.SHUT_RD),
+	Send    = c.int(posix.SHUT_WR),
+	Both    = c.int(posix.SHUT_RDWR),
 }

 Shutdown_Error :: enum c.int {
 	None           = 0,
-	Aborted        = c.int(os.ECONNABORTED),
-	Reset          = c.int(os.ECONNRESET),
-	Offline        = c.int(os.ENETDOWN),
-	Not_Connected  = c.int(os.ENOTCONN),
-	Not_Socket     = c.int(os.ENOTSOCK),
-	Invalid_Manner = c.int(os.EINVAL),
+	Aborted        = c.int(posix.ECONNABORTED),
+	Reset          = c.int(posix.ECONNRESET),
+	Offline        = c.int(posix.ENETDOWN),
+	Not_Connected  = c.int(posix.ENOTCONN),
+	Not_Socket     = c.int(posix.ENOTSOCK),
+	Invalid_Manner = c.int(posix.EINVAL),
 }

 Socket_Option_Error :: enum c.int {
 	None                       = 0,
-	Offline                    = c.int(os.ENETDOWN),
-	Timeout_When_Keepalive_Set = c.int(os.ENETRESET),
-	Invalid_Option_For_Socket  = c.int(os.ENOPROTOOPT),
-	Reset_When_Keepalive_Set   = c.int(os.ENOTCONN),
-	Not_Socket                 = c.int(os.ENOTSOCK),
+	Offline                    = c.int(posix.ENETDOWN),
+	Timeout_When_Keepalive_Set = c.int(posix.ENETRESET),
+	Invalid_Option_For_Socket  = c.int(posix.ENOPROTOOPT),
+	Reset_When_Keepalive_Set   = c.int(posix.ENOTCONN),
+	Not_Socket                 = c.int(posix.ENOTSOCK),
 }

 Set_Blocking_Error :: enum c.int {
 	None = 0,

 	// TODO: Add errors for `set_blocking`
-}
+}
@@ -20,60 +20,57 @@ package net
 		Feoramund:       FreeBSD platform code
 */

-import "core:os"
 import "core:strings"
+import "core:sys/posix"
+
+foreign import lib "system:System.framework"

@(private)
 _enumerate_interfaces :: proc(allocator := context.allocator) -> (interfaces: []Network_Interface, err: Network_Error) {
 	context.allocator = allocator

-	head: ^os.ifaddrs
-
-	if res := os._getifaddrs(&head); res < 0 {
+	head: ^ifaddrs
+	if getifaddrs(&head) != .OK {
 		return {}, .Unable_To_Enumerate_Network_Interfaces
 	}
+	defer freeifaddrs(head)

-	/*
-		Unlike Windows, *nix regrettably doesn't return all it knows about an interface in one big struct.
-		We're going to have to iterate over a list and coalesce information as we go.
-	*/
-	ifaces: map[string]^Network_Interface
+	ifaces: map[string]Network_Interface
 	defer delete(ifaces)

 	for ifaddr := head; ifaddr != nil; ifaddr = ifaddr.next {
 		adapter_name := string(ifaddr.name)

-		/*
-			Check if we have seen this interface name before so we can reuse the `Network_Interface`.
-			Else, create a new one.
-		*/
-		if adapter_name not_in ifaces {
-			ifaces[adapter_name] = new(Network_Interface)
-			ifaces[adapter_name].adapter_name = strings.clone(adapter_name)
+		key_ptr, iface, inserted, mem_err := map_entry(&ifaces, adapter_name)
+		if mem_err == nil && inserted {
+			key_ptr^, mem_err = strings.clone(adapter_name)
+			iface.adapter_name = key_ptr^
+		}
+		if mem_err != nil {
+			return {}, .Unable_To_Enumerate_Network_Interfaces
 		}
-		iface := ifaces[adapter_name]

 		address: Address
 		netmask: Netmask

-		if ifaddr.address != nil {
-			switch int(ifaddr.address.family) {
-			case os.AF_INET, os.AF_INET6:
-				address = _sockaddr_basic_to_endpoint(ifaddr.address).address
+		if ifaddr.addr != nil {
+			#partial switch ifaddr.addr.sa_family {
+			case .INET, .INET6:
+				address = _sockaddr_basic_to_endpoint(ifaddr.addr).address
 			}
 		}

 		if ifaddr.netmask != nil {
-			switch int(ifaddr.netmask.family) {
-			case os.AF_INET, os.AF_INET6:
+			#partial switch ifaddr.netmask.sa_family {
+			case .INET, .INET6:
 				netmask = Netmask(_sockaddr_basic_to_endpoint(ifaddr.netmask).address)
 			}
 		}

-		if ifaddr.broadcast_or_dest != nil && .BROADCAST in ifaddr.flags {
-			switch int(ifaddr.broadcast_or_dest.family) {
-			case os.AF_INET, os.AF_INET6:
-				broadcast := _sockaddr_basic_to_endpoint(ifaddr.broadcast_or_dest).address
+		if ifaddr.dstaddr != nil && .BROADCAST in ifaddr.flags {
+			#partial switch ifaddr.dstaddr.sa_family {
+			case .INET, .INET6:
+				broadcast := _sockaddr_basic_to_endpoint(ifaddr.dstaddr).address
 				append(&iface.multicast, broadcast)
 			}
 		}
@@ -105,18 +102,51 @@ _enumerate_interfaces :: proc(allocator := context.allocator) -> (interfaces: []
 		iface.link.state = state
 	}

-	/*
-		Free the OS structures.
-	*/
-	os._freeifaddrs(head)
-
-	/*
-		Turn the map into a slice to return.
-	*/
-	_interfaces := make([dynamic]Network_Interface, 0, allocator)
+	interfaces = make([]Network_Interface, len(ifaces))
+	i: int
 	for _, iface in ifaces {
-		append(&_interfaces, iface^)
-		free(iface)
+		interfaces[i] = iface
+		i += 1
 	}
-	return _interfaces[:], {}
+	return interfaces, nil
+}
+
+@(private)
+IF_Flag :: enum u32 {
+	UP,
+	BROADCAST,
+	DEBUG,
+	LOOPBACK,
+	POINTTOPOINT,
+	NOTRAILERS,
+	RUNNING,
+	NOARP,
+	PROMISC,
+	ALLMULTI,
+	OACTIVE,
+	SIMPLEX,
+	LINK0,
+	LINK1,
+	LINK2,
+	MULTICAST,
+}
+
+@(private)
+IF_Flags :: bit_set[IF_Flag; u32]
+
+@(private)
+ifaddrs :: struct {
+	next:    ^ifaddrs,
+	name:    cstring,
+	flags:   IF_Flags,
+	addr:    ^posix.sockaddr,
+	netmask: ^posix.sockaddr,
+	dstaddr: ^posix.sockaddr,
+	data:    rawptr,
+}
+
+@(private)
+foreign lib {
+	getifaddrs  :: proc(ifap: ^^ifaddrs) -> posix.result ---
+	freeifaddrs :: proc(ifp: ^ifaddrs) ---
 }
@@ -21,44 +21,45 @@ package net
 */

 import "core:c"
-import "core:os"
 import "core:sys/posix"
 import "core:time"

 Socket_Option :: enum c.int {
-	Broadcast                 = c.int(os.SO_BROADCAST),
-	Reuse_Address             = c.int(os.SO_REUSEADDR),
-	Keep_Alive                = c.int(os.SO_KEEPALIVE),
-	Out_Of_Bounds_Data_Inline = c.int(os.SO_OOBINLINE),
-	TCP_Nodelay               = c.int(os.TCP_NODELAY),
-	Linger                    = c.int(os.SO_LINGER),
-	Receive_Buffer_Size       = c.int(os.SO_RCVBUF),
-	Send_Buffer_Size          = c.int(os.SO_SNDBUF),
-	Receive_Timeout           = c.int(os.SO_RCVTIMEO),
-	Send_Timeout              = c.int(os.SO_SNDTIMEO),
+	Broadcast                 = c.int(posix.Sock_Option.BROADCAST),
+	Reuse_Address             = c.int(posix.Sock_Option.REUSEADDR),
+	Keep_Alive                = c.int(posix.Sock_Option.KEEPALIVE),
+	Out_Of_Bounds_Data_Inline = c.int(posix.Sock_Option.OOBINLINE),
+	TCP_Nodelay               = c.int(posix.TCP_NODELAY),
+	Linger                    = c.int(posix.Sock_Option.LINGER),
+	Receive_Buffer_Size       = c.int(posix.Sock_Option.RCVBUF),
+	Send_Buffer_Size          = c.int(posix.Sock_Option.SNDBUF),
+	Receive_Timeout           = c.int(posix.Sock_Option.RCVTIMEO),
+	Send_Timeout              = c.int(posix.Sock_Option.SNDTIMEO),
 }

@(private)
 _create_socket :: proc(family: Address_Family, protocol: Socket_Protocol) -> (socket: Any_Socket, err: Network_Error) {
-	c_type, c_protocol, c_family: int
+	c_type: posix.Sock
+	c_protocol: posix.Protocol
+	c_family: posix.AF

 	switch family {
-	case .IP4:  c_family = os.AF_INET
-	case .IP6:  c_family = os.AF_INET6
+	case .IP4:  c_family = .INET
+	case .IP6:  c_family = .INET6
 	case:
 		unreachable()
 	}

 	switch protocol {
-	case .TCP:  c_type = os.SOCK_STREAM; c_protocol = os.IPPROTO_TCP
-	case .UDP:  c_type = os.SOCK_DGRAM;  c_protocol = os.IPPROTO_UDP
+	case .TCP:  c_type = .STREAM; c_protocol = .TCP
+	case .UDP:  c_type = .DGRAM;  c_protocol = .UDP
 	case:
 		unreachable()
 	}

-	sock, sock_err := os.socket(c_family, c_type, c_protocol)
-	if sock_err != nil {
-		err = Create_Socket_Error(os.is_platform_error(sock_err) or_else -1)
+	sock := posix.socket(c_family, c_type, c_protocol)
+	if sock < 0 {
+		err = Create_Socket_Error(posix.errno())
 		return
 	}

@@ -86,10 +87,10 @@ _dial_tcp_from_endpoint :: proc(endpoint: Endpoint, options := default_tcp_optio
 	_ = set_option(skt, .Reuse_Address, true)

 	sockaddr := _endpoint_to_sockaddr(endpoint)
-	res := os.connect(os.Socket(skt), (^os.SOCKADDR)(&sockaddr), i32(sockaddr.len))
-	if res != nil {
+	if posix.connect(posix.FD(skt), (^posix.sockaddr)(&sockaddr), posix.socklen_t(sockaddr.ss_len)) != .OK {
+		errno := posix.errno()
 		close(skt)
-		return {}, Dial_Error(os.is_platform_error(res) or_else -1)
+		return {}, Dial_Error(errno)
 	}

 	return
@@ -102,14 +103,15 @@ MAX_PRIVILEGED_PORT :: 1023
 _bind :: proc(skt: Any_Socket, ep: Endpoint) -> (err: Network_Error) {
 	sockaddr := _endpoint_to_sockaddr(ep)
 	s := any_socket_to_socket(skt)
-	res := os.bind(os.Socket(s), (^os.SOCKADDR)(&sockaddr), i32(sockaddr.len))
-	if res != nil {
-		if res == os.EACCES && ep.port <= MAX_PRIVILEGED_PORT {
+	if posix.bind(posix.FD(s), (^posix.sockaddr)(&sockaddr), posix.socklen_t(sockaddr.ss_len)) != .OK {
+		errno := posix.errno()
+		if errno == .EACCES && ep.port <= MAX_PRIVILEGED_PORT {
 			err = .Privileged_Port_Without_Root
 		} else {
-			err = Bind_Error(os.is_platform_error(res) or_else -1)
+			err = Bind_Error(errno)
 		}
 	}
+
 	return
 }

@@ -131,9 +133,8 @@ _listen_tcp :: proc(interface_endpoint: Endpoint, backlog := 1000) -> (skt: TCP_

 	bind(sock, interface_endpoint) or_return

-	res := os.listen(os.Socket(skt), backlog)
-	if res != nil {
-		err = Listen_Error(os.is_platform_error(res) or_else -1)
+	if posix.listen(posix.FD(skt), i32(backlog)) != .OK {
+		err = Listen_Error(posix.errno())
 		return
 	}

@@ -144,34 +145,34 @@ _listen_tcp :: proc(interface_endpoint: Endpoint, backlog := 1000) -> (skt: TCP_
 _bound_endpoint :: proc(sock: Any_Socket) -> (ep: Endpoint, err: Network_Error) {
 	addr: posix.sockaddr_storage
 	addr_len := posix.socklen_t(size_of(addr))
-	res := posix.getsockname(posix.FD(any_socket_to_socket(sock)), (^posix.sockaddr)(&addr), &addr_len)
-	if res != .OK {
+	if posix.getsockname(posix.FD(any_socket_to_socket(sock)), (^posix.sockaddr)(&addr), &addr_len) != .OK {
 		err = Listen_Error(posix.errno())
 		return
 	}
-	ep = _sockaddr_to_endpoint((^os.SOCKADDR_STORAGE_LH)(&addr))
+
+	ep = _sockaddr_to_endpoint(&addr)
 	return
 }

@(private)
 _accept_tcp :: proc(sock: TCP_Socket, options := default_tcp_options) -> (client: TCP_Socket, source: Endpoint, err: Network_Error) {
-	sockaddr: os.SOCKADDR_STORAGE_LH
-	sockaddrlen := c.int(size_of(sockaddr))
-
-	client_sock, client_sock_err := os.accept(os.Socket(sock), cast(^os.SOCKADDR) &sockaddr, &sockaddrlen)
-	if client_sock_err != nil {
-		err = Accept_Error(os.is_platform_error(client_sock_err) or_else -1)
+	addr: posix.sockaddr_storage
+	addr_len := posix.socklen_t(size_of(addr))
+	client_sock := posix.accept(posix.FD(sock), (^posix.sockaddr)(&addr), &addr_len)
+	if client_sock < 0 {
+		err = Accept_Error(posix.errno())
 		return
 	}
+
 	client = TCP_Socket(client_sock)
-	source = _sockaddr_to_endpoint(&sockaddr)
+	source = _sockaddr_to_endpoint(&addr)
 	return
 }

@(private)
 _close :: proc(skt: Any_Socket) {
 	s := any_socket_to_socket(skt)
-	os.close(os.Handle(os.Socket(s)))
+	posix.close(posix.FD(s))
 }

@(private)
@@ -179,11 +180,13 @@ _recv_tcp :: proc(skt: TCP_Socket, buf: []byte) -> (bytes_read: int, err: Networ
 	if len(buf) <= 0 {
 		return
 	}
-	res, res_err := os.recv(os.Socket(skt), buf, 0)
-	if res_err != nil {
-		err = TCP_Recv_Error(os.is_platform_error(res_err) or_else -1)
+
+	res := posix.recv(posix.FD(skt), raw_data(buf), len(buf), {})
+	if res < 0 {
+		err = TCP_Recv_Error(posix.errno())
 		return
 	}
+
 	return int(res), nil
 }

@@ -193,11 +196,11 @@ _recv_udp :: proc(skt: UDP_Socket, buf: []byte) -> (bytes_read: int, remote_endp
 		return
 	}

-	from: os.SOCKADDR_STORAGE_LH
-	fromsize := c.int(size_of(from))
-	res, res_err := os.recvfrom(os.Socket(skt), buf, 0, cast(^os.SOCKADDR) &from, &fromsize)
-	if res_err != nil {
-		err = UDP_Recv_Error(os.is_platform_error(res_err) or_else -1)
+	from: posix.sockaddr_storage
+	fromsize := posix.socklen_t(size_of(from))
+	res := posix.recvfrom(posix.FD(skt), raw_data(buf), len(buf), {}, (^posix.sockaddr)(&from), &fromsize)
+	if res < 0 {
+		err = UDP_Recv_Error(posix.errno())
 		return
 	}

@@ -211,15 +214,19 @@ _send_tcp :: proc(skt: TCP_Socket, buf: []byte) -> (bytes_written: int, err: Net
 	for bytes_written < len(buf) {
 		limit := min(int(max(i32)), len(buf) - bytes_written)
 		remaining := buf[bytes_written:][:limit]
-		res, res_err := os.send(os.Socket(skt), remaining, os.MSG_NOSIGNAL)
-		if res_err == os.EPIPE {
-			// EPIPE arises if the socket has been closed remotely.
-			err = TCP_Send_Error.Connection_Closed
-			return
-		} else if res_err != nil {
-			err = TCP_Send_Error(os.is_platform_error(res_err) or_else -1)
+		res := posix.send(posix.FD(skt), raw_data(remaining), len(remaining), {.NOSIGNAL})
+		if res < 0 {
+			errno := posix.errno()
+			if errno == .EPIPE {
+				// EPIPE arises if the socket has been closed remotely.
+				err = TCP_Send_Error.Connection_Closed
+				return
+			}
+
+			err = TCP_Send_Error(errno)
 			return
 		}
+
 		bytes_written += int(res)
 	}
 	return
@@ -231,15 +238,19 @@ _send_udp :: proc(skt: UDP_Socket, buf: []byte, to: Endpoint) -> (bytes_written:
 	for bytes_written < len(buf) {
 		limit := min(1<<31, len(buf) - bytes_written)
 		remaining := buf[bytes_written:][:limit]
-		res, res_err := os.sendto(os.Socket(skt), remaining, os.MSG_NOSIGNAL, cast(^os.SOCKADDR)&toaddr, i32(toaddr.len))
-		if res_err == os.EPIPE {
-			// EPIPE arises if the socket has been closed remotely.
-			err = UDP_Send_Error.Not_Socket
-			return
-		} else if res_err != nil {
-			err = UDP_Send_Error(os.is_platform_error(res_err) or_else -1)
+		res := posix.sendto(posix.FD(skt), raw_data(remaining), len(remaining), {.NOSIGNAL}, (^posix.sockaddr)(&toaddr), posix.socklen_t(toaddr.ss_len))
+		if res < 0 {
+			errno := posix.errno()
+			if errno == .EPIPE {
+				// EPIPE arises if the socket has been closed remotely.
+				err = UDP_Send_Error.Not_Socket
+				return
+			}
+
+			err = UDP_Send_Error(errno)
 			return
 		}
+
 		bytes_written += int(res)
 	}
 	return
@@ -248,26 +259,25 @@ _send_udp :: proc(skt: UDP_Socket, buf: []byte, to: Endpoint) -> (bytes_written:
@(private)
 _shutdown :: proc(skt: Any_Socket, manner: Shutdown_Manner) -> (err: Network_Error) {
 	s := any_socket_to_socket(skt)
-	res := os.shutdown(os.Socket(s), int(manner))
-	if res != nil {
-		return Shutdown_Error(os.is_platform_error(res) or_else -1)
+	if posix.shutdown(posix.FD(s), posix.Shut(manner)) != .OK {
+		err = Shutdown_Error(posix.errno())
 	}
 	return
 }

@(private)
 _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #caller_location) -> Network_Error {
-	level := os.SOL_SOCKET if option != .TCP_Nodelay else os.IPPROTO_TCP
+	level := posix.SOL_SOCKET if option != .TCP_Nodelay else posix.IPPROTO_TCP

 	// NOTE(tetra, 2022-02-15): On Linux, you cannot merely give a single byte for a bool;
 	//  it _has_ to be a b32.
 	//  I haven't tested if you can give more than that.
 	bool_value: b32
-	int_value: i32
-	timeval_value: os.Timeval
+	int_value: posix.socklen_t
+	timeval_value: posix.timeval

 	ptr: rawptr
-	len: os.socklen_t
+	len: posix.socklen_t

 	switch option {
 	case
@@ -302,8 +312,8 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 			t := value.(time.Duration) or_else panic("set_option() value must be a time.Duration here", loc)

 			micros := i64(time.duration_microseconds(t))
-			timeval_value.microseconds = int(micros % 1e6)
-			timeval_value.seconds = (micros - i64(timeval_value.microseconds)) / 1e6
+			timeval_value.tv_usec = posix.suseconds_t(micros % 1e6)
+			timeval_value.tv_sec  = posix.time_t(micros - i64(timeval_value.tv_usec)) / 1e6

 			ptr = &timeval_value
 			len = size_of(timeval_value)
@@ -312,12 +322,12 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 		.Send_Buffer_Size:
 			// TODO: check for out of range values and return .Value_Out_Of_Range?
 			switch i in value {
-			case i8, u8:   i2 := i; int_value = os.socklen_t((^u8)(&i2)^)
-			case i16, u16: i2 := i; int_value = os.socklen_t((^u16)(&i2)^)
-			case i32, u32: i2 := i; int_value = os.socklen_t((^u32)(&i2)^)
-			case i64, u64: i2 := i; int_value = os.socklen_t((^u64)(&i2)^)
-			case i128, u128: i2 := i; int_value = os.socklen_t((^u128)(&i2)^)
-			case int, uint: i2 := i; int_value = os.socklen_t((^uint)(&i2)^)
+			case i8, u8:   i2 := i; int_value = posix.socklen_t((^u8)(&i2)^)
+			case i16, u16: i2 := i; int_value = posix.socklen_t((^u16)(&i2)^)
+			case i32, u32: i2 := i; int_value = posix.socklen_t((^u32)(&i2)^)
+			case i64, u64: i2 := i; int_value = posix.socklen_t((^u64)(&i2)^)
+			case i128, u128: i2 := i; int_value = posix.socklen_t((^u128)(&i2)^)
+			case int, uint: i2 := i; int_value = posix.socklen_t((^uint)(&i2)^)
 			case:
 				panic("set_option() value must be an integer here", loc)
 			}
@@ -326,9 +336,8 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 	}

 	skt := any_socket_to_socket(s)
-	res := os.setsockopt(os.Socket(skt), int(level), int(option), ptr, len)
-	if res != nil {
-		return Socket_Option_Error(os.is_platform_error(res) or_else -1)
+	if posix.setsockopt(posix.FD(skt), i32(level), posix.Sock_Option(option), ptr, len) != .OK {
+		return Socket_Option_Error(posix.errno())
 	}

 	return nil
@@ -338,42 +347,42 @@ _set_option :: proc(s: Any_Socket, option: Socket_Option, value: any, loc := #ca
 _set_blocking :: proc(socket: Any_Socket, should_block: bool) -> (err: Network_Error) {
 	socket := any_socket_to_socket(socket)

-	flags, getfl_err := os.fcntl(int(socket), os.F_GETFL, 0)
-	if getfl_err != nil {
-		return Set_Blocking_Error(os.is_platform_error(getfl_err) or_else -1)
+	flags_ := posix.fcntl(posix.FD(socket), .GETFL, 0)
+	if flags_ < 0 {
+		return Set_Blocking_Error(posix.errno())
 	}
+	flags := transmute(posix.O_Flags)flags_

 	if should_block {
-		flags &~= int(os.O_NONBLOCK)
+		flags -= {.NONBLOCK}
 	} else {
-		flags |= int(os.O_NONBLOCK)
+		flags += {.NONBLOCK}
 	}

-	_, setfl_err := os.fcntl(int(socket), os.F_SETFL, flags)
-	if setfl_err != nil {
-		return Set_Blocking_Error(os.is_platform_error(setfl_err) or_else -1)
+	if posix.fcntl(posix.FD(socket), .SETFL, flags) < 0 {
+		return Set_Blocking_Error(posix.errno())
 	}

 	return nil
 }

@private
-_endpoint_to_sockaddr :: proc(ep: Endpoint) -> (sockaddr: os.SOCKADDR_STORAGE_LH) {
+_endpoint_to_sockaddr :: proc(ep: Endpoint) -> (sockaddr: posix.sockaddr_storage) {
 	switch a in ep.address {
 	case IP4_Address:
-		(^os.sockaddr_in)(&sockaddr)^ = os.sockaddr_in {
+		(^posix.sockaddr_in)(&sockaddr)^ = posix.sockaddr_in {
 			sin_port = u16be(ep.port),
-			sin_addr = transmute(os.in_addr) a,
-			sin_family = u8(os.AF_INET),
-			sin_len = size_of(os.sockaddr_in),
+			sin_addr = transmute(posix.in_addr)a,
+			sin_family = .INET,
+			sin_len = size_of(posix.sockaddr_in),
 		}
 		return
 	case IP6_Address:
-		(^os.sockaddr_in6)(&sockaddr)^ = os.sockaddr_in6 {
+		(^posix.sockaddr_in6)(&sockaddr)^ = posix.sockaddr_in6 {
 			sin6_port = u16be(ep.port),
-			sin6_addr = transmute(os.in6_addr) a,
-			sin6_family = u8(os.AF_INET6),
-			sin6_len = size_of(os.sockaddr_in6),
+			sin6_addr = transmute(posix.in6_addr)a,
+			sin6_family = .INET6,
+			sin6_len = size_of(posix.sockaddr_in6),
 		}
 		return
 	}
@@ -381,21 +390,21 @@ _endpoint_to_sockaddr :: proc(ep: Endpoint) -> (sockaddr: os.SOCKADDR_STORAGE_LH
 }

@private
-_sockaddr_to_endpoint :: proc(native_addr: ^os.SOCKADDR_STORAGE_LH) -> (ep: Endpoint) {
-	switch native_addr.family {
-	case u8(os.AF_INET):
-		addr := cast(^os.sockaddr_in) native_addr
+_sockaddr_to_endpoint :: proc(native_addr: ^posix.sockaddr_storage) -> (ep: Endpoint) {
+	#partial switch native_addr.ss_family {
+	case .INET:
+		addr := cast(^posix.sockaddr_in)native_addr
 		port := int(addr.sin_port)
 		ep = Endpoint {
-			address = IP4_Address(transmute([4]byte) addr.sin_addr),
-			port = port,
+			address = IP4_Address(transmute([4]byte)addr.sin_addr),
+			port    = port,
 		}
-	case u8(os.AF_INET6):
-		addr := cast(^os.sockaddr_in6) native_addr
+	case .INET6:
+		addr := cast(^posix.sockaddr_in6)native_addr
 		port := int(addr.sin6_port)
 		ep = Endpoint {
-			address = IP6_Address(transmute([8]u16be) addr.sin6_addr),
-			port = port,
+			address = IP6_Address(transmute([8]u16be)addr.sin6_addr),
+			port    = port,
 		}
 	case:
 		panic("native_addr is neither IP4 or IP6 address")
@@ -404,21 +413,21 @@ _sockaddr_to_endpoint :: proc(native_addr: ^os.SOCKADDR_STORAGE_LH) -> (ep: Endp
 }

@(private)
-_sockaddr_basic_to_endpoint :: proc(native_addr: ^os.SOCKADDR) -> (ep: Endpoint) {
-	switch u16(native_addr.family) {
-	case u16(os.AF_INET):
-		addr := cast(^os.sockaddr_in) native_addr
+_sockaddr_basic_to_endpoint :: proc(native_addr: ^posix.sockaddr) -> (ep: Endpoint) {
+	#partial switch native_addr.sa_family {
+	case .INET:
+		addr := cast(^posix.sockaddr_in)native_addr
 		port := int(addr.sin_port)
 		ep = Endpoint {
-			address = IP4_Address(transmute([4]byte) addr.sin_addr),
-			port = port,
+			address = IP4_Address(transmute([4]byte)addr.sin_addr),
+			port    = port,
 		}
-	case u16(os.AF_INET6):
-		addr := cast(^os.sockaddr_in6) native_addr
+	case .INET6:
+		addr := cast(^posix.sockaddr_in6)native_addr
 		port := int(addr.sin6_port)
 		ep = Endpoint {
-			address = IP6_Address(transmute([8]u16be) addr.sin6_addr),
-			port = port,
+			address = IP6_Address(transmute([8]u16be)addr.sin6_addr),
+			port    = port,
 		}
 	case:
 		panic("native_addr is neither IP4 or IP6 address")
@@ -5,9 +5,10 @@ import "core:strings"

@(require_results)
 read_dir :: proc(fd: Handle, n: int, allocator := context.allocator) -> (fi: []File_Info, err: Error) {
-	dupfd := _dup(fd) or_return
+	context.allocator = allocator

-	dirp := _fdopendir(dupfd) or_return
+	dupfd := _dup(fd) or_return
+	dirp  := _fdopendir(dupfd) or_return
 	defer _closedir(dirp)

 	dirpath := absolute_path_from_handle(dupfd) or_return
@@ -2,12 +2,18 @@ package os2

 import "base:runtime"

-import "core:path/filepath"
+import "core:strings"

 Path_Separator        :: _Path_Separator        // OS-Specific
 Path_Separator_String :: _Path_Separator_String // OS-Specific
 Path_List_Separator   :: _Path_List_Separator   // OS-Specific

+#assert(_Path_Separator <= rune(0x7F), "The system-specific path separator rune is expected to be within the 7-bit ASCII character set.")
+
+/*
+Return true if `c` is a character used to separate paths into directory and
+file hierarchies on the current system.
+*/
@(require_results)
 is_path_separator :: proc(c: byte) -> bool {
 	return _is_path_separator(c)
@@ -15,22 +21,42 @@ is_path_separator :: proc(c: byte) -> bool {

 mkdir :: make_directory

+/*
+Make a new directory.
+
+If `name` is relative, it will be relative to the process's current working directory.
+*/
 make_directory :: proc(name: string, perm: int = 0o755) -> Error {
 	return _mkdir(name, perm)
 }

 mkdir_all :: make_directory_all

+/*
+Make a new directory, creating new intervening directories when needed.
+
+If `path` is relative, it will be relative to the process's current working directory.
+*/
 make_directory_all :: proc(path: string, perm: int = 0o755) -> Error {
 	return _mkdir_all(path, perm)
 }

+/*
+Delete `path` and all files and directories inside of `path` if it is a directory.
+
+If `path` is relative, it will be relative to the process's current working directory.
+*/
 remove_all :: proc(path: string) -> Error {
 	return _remove_all(path)
 }

 getwd :: get_working_directory

+/*
+Get the working directory of the current process.
+
+*Allocates Using Provided Allocator*
+*/
@(require_results)
 get_working_directory :: proc(allocator: runtime.Allocator) -> (dir: string, err: Error) {
 	return _get_working_directory(allocator)
@@ -38,16 +64,399 @@ get_working_directory :: proc(allocator: runtime.Allocator) -> (dir: string, err

 setwd :: set_working_directory

+/*
+Change the working directory of the current process.
+
+This procedure does not take an allocator; `dir` is its only argument.
+*/
 set_working_directory :: proc(dir: string) -> (err: Error) {
 	return _set_working_directory(dir)
 }

+/*
+Get the path for the currently running executable.
+
+*Allocates Using Provided Allocator*
+*/
+@(require_results)
 get_executable_path :: proc(allocator: runtime.Allocator) -> (path: string, err: Error) {
 	return _get_executable_path(allocator)
 }

+/*
+Get the directory for the currently running executable.
+
+*Allocates Using Provided Allocator*
+*/
+@(require_results)
 get_executable_directory :: proc(allocator: runtime.Allocator) -> (path: string, err: Error) {
 	path = _get_executable_path(allocator) or_return
-	path, _ = filepath.split(path)
+	path, _ = split_path(path)
 	return
 }
+
+/*
+Compare two paths for exactness without normalization.
+
+This procedure takes into account case-sensitivity on differing systems.
+*/
+@(require_results)
+are_paths_identical :: proc(a, b: string) -> (identical: bool) {
+	return _are_paths_identical(a, b)
+}
+
+/*
+Normalize a path.
+
+*Allocates Using Provided Allocator*
+
+This will remove duplicate separators and unneeded references to the current or
+parent directory.
+*/
+@(require_results)
+clean_path :: proc(path: string, allocator: runtime.Allocator) -> (cleaned: string, err: Error) {
+	if path == "" || path == "." {
+		return strings.clone(".", allocator)
+	}
+
+	TEMP_ALLOCATOR_GUARD()
+
+	// The extra byte is to simplify appending path elements by letting the
+	// loop to end each with a separator. We'll trim the last one when we're done.
+	buffer := make([]u8, len(path) + 1, temp_allocator()) or_return
+
+	// This is the only point where Windows and POSIX differ, as Windows has
+	// alphabet-based volumes for root paths.
+	rooted, start := _clean_path_handle_start(path, buffer)
+
+	head, buffer_i := start, start
+	for i, j := start, start; i <= len(path); i += 1 {
+		if i == len(path) || _is_path_separator(path[i]) {
+			elem := path[j:i]
+			j = i + 1
+
+			switch elem {
+			case "", ".":
+				// Skip duplicate path separators and current directory references.
+			case "..":
+				if !rooted && buffer_i == head {
+					// Only allow accessing further parent directories when the path is relative.
+					buffer[buffer_i] = '.'
+					buffer[buffer_i+1] = '.'
+					buffer[buffer_i+2] = _Path_Separator
+					buffer_i += 3
+					head = buffer_i
+				} else {
+					// Roll back to the last separator or the head of the buffer.
+					back_to := head
+					// `buffer_i` will be equal to 1 + the last set byte, so
+					// skipping two bytes avoids the final separator we just
+					// added.
+					for k := buffer_i-2; k >= head; k -= 1 {
+						if _is_path_separator(buffer[k]) {
+							back_to = k + 1
+							break
+						}
+					}
+					buffer_i = back_to
+				}
+			case:
+				// Copy the path element verbatim and add a separator.
+				copy(buffer[buffer_i:], elem)
+				buffer_i += len(elem)
+				buffer[buffer_i] = _Path_Separator
+				buffer_i += 1
+			}
+		}
+	}
+
+	// Trim the final separator.
+	// NOTE: No need to check if the last byte is a separator, as we always add it.
+	if buffer_i > start {
+		buffer_i -= 1
+	}
+
+	if buffer_i == 0 {
+		return strings.clone(".", allocator)
+	}
+
+	compact := make([]u8, buffer_i, allocator) or_return
+	copy(compact, buffer) // NOTE(bill): buffer[:buffer_i] is redundant here
+	return string(compact), nil
+}
+
+/*
+Return true if `path` is an absolute path as opposed to a relative one.
+*/
+@(require_results)
+is_absolute_path :: proc(path: string) -> bool {
+	return _is_absolute_path(path)
+}
+
+/*
+Get the absolute path to `path` with respect to the process's current directory.
+
+*Allocates Using Provided Allocator*
+*/
+@(require_results)
+get_absolute_path :: proc(path: string, allocator: runtime.Allocator) -> (absolute_path: string, err: Error) {
+	return _get_absolute_path(path, allocator)
+}
+
+/*
+Get the relative path needed to change directories from `base` to `target`.
+
+*Allocates Using Provided Allocator*
+
+The result is such that `join_path(base, get_relative_path(base, target))` is equivalent to `target`.
+
+NOTE: This procedure expects both `base` and `target` to be normalized first,
+which can be done by calling `clean_path` on them if needed.
+
+This procedure will return an `Invalid_Path` error if `base` begins with a
+reference to the parent directory (`".."`). Use `get_working_directory` with
+`join_path` to construct absolute paths for both arguments instead.
+*/
+@(require_results)
+get_relative_path :: proc(base, target: string, allocator: runtime.Allocator) -> (path: string, err: Error) {
+	if _are_paths_identical(base, target) {
+		return strings.clone(".", allocator)
+	}
+	if base == "." {
+		return strings.clone(target, allocator)
+	}
+
+	// This is the first point where Windows and POSIX differ, as Windows has
+	// alphabet-based volumes for root paths.
+	if !_get_relative_path_handle_start(base, target) {
+		return "", .Invalid_Path
+	}
+	if strings.has_prefix(base, "..") && (len(base) == 2 || _is_path_separator(base[2])) {
+		// We could do the work for the user of getting absolute paths for both
+		// arguments, but that could make something costly (repeatedly
+		// normalizing paths) convenient, when it would be better for the user
+		// to store already-finalized paths and operate on those instead.
+		return "", .Invalid_Path
+	}
+
+	// This is the other point where Windows and POSIX differ, as Windows is
+	// case-insensitive.
+	common := _get_common_path_len(base, target)
+
+	// Get the result of splitting `base` and `target` on _Path_Separator,
+	// comparing them up to their most common elements, then count how many
+	// unshared parts are in the split `base`.
+	seps := 0
+	size := 0
+	if len(base)-common > 0 {
+		seps = 1
+		size = 2
+	}
+	// This range skips separators on the ends of the string.
+	for i in common+1..<len(base)-1 {
+		if _is_path_separator(base[i]) {
+			seps += 1
+			size += 3
+		}
+	}
+
+	// Handle the rest of the size calculations.
+	trailing := target[common:]
+	if len(trailing) > 0 {
+		// Account for leading separators on the target after cutting the common part.
+		// (i.e. base == `/home`, target == `/home/a`)
+		if _is_path_separator(trailing[0]) {
+			trailing = trailing[1:]
+		}
+		size += len(trailing)
+		if seps > 0 {
+			size += 1
+		}
+	}
+	if trailing == "." {
+		trailing = ""
+		size -= 2
+	}
+
+	// Build the string.
+	buf := make([]u8, size, allocator) or_return
+	n := 0
+	if seps > 0 {
+		buf[0] = '.'
+		buf[1] = '.'
+		n = 2
+	}
+	for _ in 1..<seps {
+		buf[n] = _Path_Separator
+		buf[n+1] = '.'
+		buf[n+2] = '.'
+		n += 3
+	}
+	if len(trailing) > 0 {
+		if seps > 0 {
+			buf[n] = _Path_Separator
+			n += 1
+		}
+		copy(buf[n:], trailing)
+	}
+
+	path = string(buf)
+
+	return
+}
+
+/*
+Split a path into a directory hierarchy and a filename.
+
+For example, `split_path("/home/foo/bar.tar.gz")` will return `"/home/foo"` and `"bar.tar.gz"`.
+*/
+@(require_results)
+split_path :: proc(path: string) -> (dir, filename: string) {
+	return _split_path(path)
+}
+
+/*
+Join all `elems` with the system's path separator and normalize the result.
+
+*Allocates Using Provided Allocator*
+
+For example, `join_path({"/home", "foo", "bar.txt"})` will result in `"/home/foo/bar.txt"`.
+*/
+@(require_results)
+join_path :: proc(elems: []string, allocator: runtime.Allocator) -> (joined: string, err: Error) {
+	for e, i in elems {
+		if e != "" {
+			TEMP_ALLOCATOR_GUARD()
+			p := strings.join(elems[i:], Path_Separator_String, temp_allocator()) or_return
+			return clean_path(p, allocator)
+		}
+	}
+	return "", nil
+}
+
+/*
+Split a filename from its extension.
+
+This procedure splits on the last separator.
+
+If the filename begins with a separator, such as `".readme.txt"`, the separator
+will be included in the filename, resulting in `".readme"` and `"txt"`.
+
+For example, `split_filename("foo.tar.gz")` will return `"foo.tar"` and `"gz"`.
+*/
+@(require_results)
+split_filename :: proc(filename: string) -> (base, ext: string) {
+	i := strings.last_index_byte(filename, '.')
+	if i <= 0 {
+		return filename, ""
+	}
+	return filename[:i], filename[i+1:]
+}
+
+/*
+Split a filename from its extension.
+
+This procedure splits on the first separator.
+
+If the filename begins with a separator, such as `".readme.txt.gz"`, the separator
+will be included in the filename, resulting in `".readme"` and `"txt.gz"`.
+
+For example, `split_filename_all("foo.tar.gz")` will return `"foo"` and `"tar.gz"`.
+*/
+@(require_results)
+split_filename_all :: proc(filename: string) -> (base, ext: string) {
+	i := strings.index_byte(filename, '.')
+	if i == 0 {
+		j := strings.index_byte(filename[1:], '.')
+		if j != -1 {
+			j += 1
+		}
+		i = j
+	}
+	if i == -1 {
+		return filename, ""
+	}
+	return filename[:i], filename[i+1:]
+}
+
+/*
+Join `base` and `ext` with the system's filename extension separator.
+
+*Allocates Using Provided Allocator*
+
+For example, `join_filename("foo", "tar.gz")` will result in `"foo.tar.gz"`.
+*/
+@(require_results)
+join_filename :: proc(base: string, ext: string, allocator: runtime.Allocator) -> (joined: string, err: Error) {
+	if len(base) == 0 {
+		return strings.clone(ext, allocator)
+	} else if len(ext) == 0 {
+		return strings.clone(base, allocator)
+	}
+
+	buf := make([]u8, len(base) + 1 + len(ext), allocator) or_return
+	copy(buf, base)
+	buf[len(base)] = '.'
+	copy(buf[1+len(base):], ext)
+
+	return string(buf), nil
+}
+
+/*
+Split a string that is separated by a system-specific separator, typically used
+for environment variables specifying multiple directories.
+
+*Allocates Using Provided Allocator*
+
+For example, there is the "PATH" environment variable on POSIX systems which
+this procedure can split into separate entries.
+*/
+@(require_results)
+split_path_list :: proc(path: string, allocator: runtime.Allocator) -> (list: []string, err: Error) {
+	if path == "" {
+		return nil, nil
+	}
+
+	start: int
+	quote: bool
+
+	start, quote = 0, false
+	count := 0
+
+	for i := 0; i < len(path); i += 1 {
+		c := path[i]
+		switch {
+		case c == '"':
+			quote = !quote
+		case c == Path_List_Separator && !quote:
+			count += 1
+		}
+	}
+
+	start, quote = 0, false
+	list = make([]string, count + 1, allocator) or_return
+	index := 0
+	for i := 0; i < len(path); i += 1 {
+		c := path[i]
+		switch {
+		case c == '"':
+			quote = !quote
+		case c == Path_List_Separator && !quote:
+			list[index] = path[start:i]
+			index += 1
+			start = i + 1
+		}
+	}
+	assert(index == count)
+	list[index] = path[start:]
+
+	for s0, i in list {
+		s, new := strings.replace_all(s0, `"`, ``, allocator)
+		if !new {
+			s = strings.clone(s, allocator) or_return
+		}
+		list[i] = s
+	}
+
+	return list, nil
+}
@@ -14,7 +14,7 @@ _Path_List_Separator   :: ':'
 _OPENDIR_FLAGS : linux.Open_Flags : {.NONBLOCK, .DIRECTORY, .LARGEFILE, .CLOEXEC}

 _is_path_separator :: proc(c: byte) -> bool {
-	return c == '/'
+	return c == _Path_Separator
 }

 _mkdir :: proc(path: string, perm: int) -> Error {
@@ -3,7 +3,6 @@
 package os2

 import "base:runtime"
-import "core:path/filepath"

 import "core:sys/posix"

@@ -35,11 +34,11 @@ _mkdir_all :: proc(path: string, perm: int) -> Error {
 		return .Exist
 	}

-	clean_path := filepath.clean(path, temp_allocator())
+	clean_path := clean_path(path, temp_allocator()) or_return
 	return internal_mkdir_all(clean_path, perm)

 	internal_mkdir_all :: proc(path: string, perm: int) -> Error {
-		dir, file := filepath.split(path)
+		dir, file := split_path(path)
 		if file != path && dir != "/" {
 			if len(dir) > 1 && dir[len(dir) - 1] == '/' {
 				dir = dir[:len(dir) - 1]
@@ -0,0 +1,78 @@
+#+private
+#+build linux, darwin, netbsd, freebsd, openbsd, wasi
+package os2
+
+// This implementation is for all systems that have POSIX-compliant filesystem paths.
+
+import "base:runtime"
+import "core:strings"
+import "core:sys/posix"
+
+_are_paths_identical :: proc(a, b: string) -> (identical: bool) {
+	return a == b
+}
+
+_clean_path_handle_start :: proc(path: string, buffer: []u8) -> (rooted: bool, start: int) {
+	// Preserve rooted paths.
+	if _is_path_separator(path[0]) {
+		rooted = true
+		buffer[0] = _Path_Separator
+		start = 1
+	}
+	return
+}
+
+_is_absolute_path :: proc(path: string) -> bool {
+	return len(path) > 0 && _is_path_separator(path[0])
+}
+
+_get_absolute_path :: proc(path: string, allocator: runtime.Allocator) -> (absolute_path: string, err: Error) {
+	rel := path
+	if rel == "" {
+		rel = "." // An empty path is resolved as the current directory.
+	}
+	TEMP_ALLOCATOR_GUARD()
+	rel_cstr := strings.clone_to_cstring(rel, temp_allocator()) or_return
+	path_ptr := posix.realpath(rel_cstr, nil)
+	if path_ptr == nil {
+		return "", Platform_Error(posix.errno())
+	}
+	defer posix.free(path_ptr)
+	// Propagate allocation failures instead of returning "" with a nil error.
+	path_str := strings.clone(string(path_ptr), allocator) or_return
+	return path_str, nil
+}
+
+_get_relative_path_handle_start :: proc(base, target: string) -> bool {
+	base_rooted   := len(base)   > 0 && _is_path_separator(base[0])
+	target_rooted := len(target) > 0 && _is_path_separator(target[0])
+	return base_rooted == target_rooted
+}
+
+_get_common_path_len :: proc(base, target: string) -> int {
+	i := 0 // Length of the shared prefix, always cut at a path element boundary.
+	end := min(len(base), len(target))
+	for j in 0..=end {
+		// An element ends only where BOTH paths have a separator or terminate (`/home/foo` vs `/home/foobar`).
+		base_ends   := j == len(base)   || _is_path_separator(base[j])
+		target_ends := j == len(target) || _is_path_separator(target[j])
+		if base_ends && target_ends {
+			if base[i:j] != target[i:j] { break }
+			i = j
+		}
+	}
+	return i
+}
+
+_split_path :: proc(path: string) -> (dir, file: string) {
+	i := len(path) - 1
+	for i >= 0 && !_is_path_separator(path[i]) {
+		i -= 1
+	}
+	if i == 0 {
+		return path[:i+1], path[i+1:]
+	} else if i > 0 {
+		return path[:i], path[i+1:]
+	}
+	return "", path
+}
@@ -3,7 +3,6 @@ package os2

 import "base:runtime"

-import "core:path/filepath"
 import "core:sync"
 import "core:sys/wasm/wasi"

@@ -35,11 +34,11 @@ _mkdir_all :: proc(path: string, perm: int) -> Error {
 		return .Exist
 	}

-	clean_path := filepath.clean(path, temp_allocator())
+	clean_path := clean_path(path, temp_allocator()) or_return
 	return internal_mkdir_all(clean_path)

 	internal_mkdir_all :: proc(path: string) -> Error {
-		dir, file := filepath.split(path)
+		dir, file := split_path(path)
 		if file != path && dir != "/" {
 			if len(dir) > 1 && dir[len(dir) - 1] == '/' {
 				dir = dir[:len(dir) - 1]
@@ -1,8 +1,9 @@
 #+private
 package os2

-import win32 "core:sys/windows"
 import "base:runtime"
+import "core:strings"
+import win32 "core:sys/windows"

 _Path_Separator        :: '\\'
 _Path_Separator_String :: "\\"
@@ -217,7 +218,7 @@ _fix_long_path_internal :: proc(path: string) -> string {
 		return path
 	}

-	if !_is_abs(path) { // relative path
+	if !_is_absolute_path(path) { // relative path
 		return path
 	}

@@ -257,3 +258,93 @@ _fix_long_path_internal :: proc(path: string) -> string {

 	return string(path_buf[:w])
 }
+
+_are_paths_identical :: strings.equal_fold
+
+_clean_path_handle_start :: proc(path: string, buffer: []u8) -> (rooted: bool, start: int) {
+	// Preserve rooted paths.
+	start = _volume_name_len(path)
+	if start > 0 {
+		rooted = true
+		if len(path) > start && _is_path_separator(path[start]) {
+			// Take `C:` to `C:\`.
+			start += 1
+		}
+		copy(buffer, path[:start])
+	}
+	return
+}
+
+_is_absolute_path :: proc(path: string) -> bool {
+	if _is_reserved_name(path) {
+		return true
+	}
+	l := _volume_name_len(path)
+	if l == 0 {
+		return false
+	}
+
+	path := path
+	path = path[l:]
+	if path == "" {
+		return false
+	}
+	return _is_path_separator(path[0])
+}
+
+_get_absolute_path :: proc(path: string, allocator: runtime.Allocator) -> (absolute_path: string, err: Error) {
+	rel := path
+	if rel == "" {
+		rel = "."
+	}
+	TEMP_ALLOCATOR_GUARD()
+	rel_utf16 := win32.utf8_to_utf16(rel, temp_allocator())
+	n := win32.GetFullPathNameW(raw_data(rel_utf16), 0, nil, nil)
+	if n == 0 {
+		return "", Platform_Error(win32.GetLastError())
+	}
+
+	buf := make([]u16, n, temp_allocator()) or_return
+	n = win32.GetFullPathNameW(raw_data(rel_utf16), u32(n), raw_data(buf), nil)
+	if n == 0 {
+		return "", Platform_Error(win32.GetLastError())
+	}
+
+	return win32.utf16_to_utf8(buf, allocator)
+}
+
+_get_relative_path_handle_start :: proc(base, target: string) -> bool {
+	base_root   := base[:_volume_name_len(base)]
+	target_root := target[:_volume_name_len(target)]
+	return strings.equal_fold(base_root, target_root)
+}
+
+_get_common_path_len :: proc(base, target: string) -> int {
+	i := 0 // Length of the shared prefix, always cut at a path element boundary.
+	end := min(len(base), len(target))
+	for j in 0..=end {
+		// An element ends only where BOTH paths have a separator or terminate (`C:\foo` vs `C:\foobar`).
+		base_ends   := j == len(base)   || _is_path_separator(base[j])
+		target_ends := j == len(target) || _is_path_separator(target[j])
+		if base_ends && target_ends {
+			if !strings.equal_fold(base[i:j], target[i:j]) { break }
+			i = j
+		}
+	}
+	return i
+}
+
+_split_path :: proc(path: string) -> (dir, file: string) {
+	vol_len := _volume_name_len(path)
+
+	i := len(path) - 1
+	for i >= vol_len && !_is_path_separator(path[i]) {
+		i -= 1
+	}
+	if i == vol_len {
+		return path[:i+1], path[i+1:]
+	} else if i > vol_len {
+		return path[:i], path[i+1:]
+	}
+	return "", path
+}
@@ -10,7 +10,6 @@ import "core:slice"
 import "core:strings"
 import "core:strconv"
 import "core:sys/linux"
-import "core:path/filepath"

 PIDFD_UNASSIGNED  :: ~uintptr(0)

@@ -205,7 +204,7 @@ _process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator
 				info.executable_path = strings.clone(cmdline[:terminator], allocator) or_return
 				info.fields += {.Executable_Path}
 			} else if cwd_err == nil {
-				info.executable_path = filepath.join({ cwd, cmdline[:terminator] }, allocator) or_return
+				info.executable_path = join_path({ cwd, cmdline[:terminator] }, allocator) or_return
 				info.fields += {.Executable_Path}
 			} else {
 				break cmdline_if
@@ -407,7 +406,7 @@ _process_start :: proc(desc: Process_Desc) -> (process: Process, err: Error) {
 	executable_name := desc.command[0]
 	if strings.index_byte(executable_name, '/') < 0 {
 		path_env := get_env("PATH", temp_allocator())
-		path_dirs := filepath.split_list(path_env, temp_allocator()) or_return
+		path_dirs := split_path_list(path_env, temp_allocator()) or_return

 		exe_builder := strings.builder_make(temp_allocator()) or_return

@@ -6,7 +6,6 @@ import "base:runtime"

 import "core:time"
 import "core:strings"
-import "core:path/filepath"

 import kq "core:sys/kqueue"
 import    "core:sys/posix"
@@ -62,7 +61,7 @@ _process_start :: proc(desc: Process_Desc) -> (process: Process, err: Error) {
 	exe_name    := desc.command[0]
 	if strings.index_byte(exe_name, '/') < 0 {
 		path_env  := get_env("PATH", temp_allocator())
-		path_dirs := filepath.split_list(path_env, temp_allocator())
+		path_dirs := split_path_list(path_env, temp_allocator()) or_return

 		found: bool
 		for dir in path_dirs {
@@ -1,7 +1,6 @@
 package os2

 import "base:runtime"
-import "core:path/filepath"
 import "core:strings"
 import "core:time"

@@ -25,7 +24,7 @@ File_Info :: struct {
 file_info_clone :: proc(fi: File_Info, allocator: runtime.Allocator) -> (cloned: File_Info, err: runtime.Allocator_Error) {
 	cloned = fi
 	cloned.fullpath = strings.clone(fi.fullpath, allocator) or_return
-	cloned.name = filepath.base(cloned.fullpath)
+	_, cloned.name = split_path(cloned.fullpath)
 	return
 }

@@ -4,7 +4,6 @@ package os2
 import "core:time"
 import "base:runtime"
 import "core:sys/linux"
-import "core:path/filepath"

 _fstat :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error) {
 	impl := (^File_Impl)(f.impl)
@@ -42,7 +41,7 @@ _fstat_internal :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> (fi: File
 		creation_time     = time.Time{i64(s.ctime.time_sec) * i64(time.Second) + i64(s.ctime.time_nsec)}, // regular stat does not provide this
 	}
 	fi.creation_time = fi.modification_time
-	fi.name = filepath.base(fi.fullpath)
+	_, fi.name = split_path(fi.fullpath)
 	return
 }

@@ -4,13 +4,12 @@ package os2

 import "base:runtime"

-import "core:path/filepath"
 import "core:sys/posix"
 import "core:time"

 internal_stat :: proc(stat: posix.stat_t, fullpath: string) -> (fi: File_Info) {
 	fi.fullpath = fullpath
-	fi.name = filepath.base(fi.fullpath)
+	_, fi.name = split_path(fi.fullpath)

 	fi.inode = u128(stat.st_ino)
 	fi.size = i64(stat.st_size)
@@ -104,7 +103,7 @@ _lstat :: proc(name: string, allocator: runtime.Allocator) -> (fi: File_Info, er
 	// NOTE: This might not be correct when given "/symlink/foo.txt",
 	// you would want that to resolve "/symlink", but not resolve "foo.txt".

-	fullpath := filepath.clean(name, temp_allocator())
+	fullpath := clean_path(name, temp_allocator()) or_return
 	assert(len(fullpath) > 0)
 	switch {
 	case fullpath[0] == '/':
@@ -3,13 +3,12 @@ package os2

 import "base:runtime"

-import "core:path/filepath"
 import "core:sys/wasm/wasi"
 import "core:time"

 internal_stat :: proc(stat: wasi.filestat_t, fullpath: string) -> (fi: File_Info) {
 	fi.fullpath = fullpath
-	fi.name = filepath.base(fi.fullpath)
+	_, fi.name = split_path(fi.fullpath)

 	fi.inode = u128(stat.ino)
 	fi.size  = i64(stat.size)
@@ -315,57 +315,37 @@ _is_UNC :: proc(path: string) -> bool {
 }

 _volume_name_len :: proc(path: string) -> int {
-	if ODIN_OS == .Windows {
-		if len(path) < 2 {
-			return 0
-		}
-		c := path[0]
-		if path[1] == ':' {
-			switch c {
-			case 'a'..='z', 'A'..='Z':
-				return 2
-			}
+	if len(path) < 2 {
+		return 0
+	}
+	c := path[0]
+	if path[1] == ':' {
+		switch c {
+		case 'a'..='z', 'A'..='Z':
+			return 2
 		}
+	}

-		// URL: https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
-		if l := len(path); l >= 5 && _is_path_separator(path[0]) && _is_path_separator(path[1]) &&
-			!_is_path_separator(path[2]) && path[2] != '.' {
-			for n := 3; n < l-1; n += 1 {
-				if _is_path_separator(path[n]) {
-					n += 1
-					if !_is_path_separator(path[n]) {
-						if path[n] == '.' {
-							break
-						}
+	// URL: https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx
+	if l := len(path); l >= 5 && _is_path_separator(path[0]) && _is_path_separator(path[1]) &&
+		!_is_path_separator(path[2]) && path[2] != '.' {
+		for n := 3; n < l-1; n += 1 {
+			if _is_path_separator(path[n]) {
+				n += 1
+				if !_is_path_separator(path[n]) {
+					if path[n] == '.' {
+						break
 					}
-					for ; n < l; n += 1 {
-						if _is_path_separator(path[n]) {
-							break
-						}
-					}
-					return n
 				}
-				break
+				for ; n < l; n += 1 {
+					if _is_path_separator(path[n]) {
+						break
+					}
+				}
+				return n
 			}
+			break
 		}
 	}
 	return 0
 }
-
-_is_abs :: proc(path: string) -> bool {
-	if _is_reserved_name(path) {
-		return true
-	}
-	l := _volume_name_len(path)
-	if l == 0 {
-		return false
-	}
-
-	path := path
-	path = path[l:]
-	if path == "" {
-		return false
-	}
-	return is_path_separator(path[0])
-}
-
@@ -343,7 +343,7 @@ AT_REMOVEDIR        :: 0x08

@(default_calling_convention="c")
 foreign libc {
-	@(link_name="__error")        __error              :: proc() -> ^c.int ---
+	@(link_name="__errno")        __error              :: proc() -> ^c.int ---

 	@(link_name="fork")           _unix_fork           :: proc() -> pid_t ---
 	@(link_name="getthrid")       _unix_getthrid       :: proc() -> int ---
@@ -21,7 +21,7 @@ _mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
 _mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 	return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b)
 }
-@(require_results, enable_target_feature="ssse3")
+@(require_results, enable_target_feature="sse2,ssse3")
 _mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i {
 	shift :: IMM8

@@ -21,6 +21,7 @@ SYS_close      : uintptr : 6
 SYS_getpid     : uintptr : 20
 SYS_recvfrom   : uintptr : 29
 SYS_accept     : uintptr : 30
+SYS_getpeername: uintptr : 31
 SYS_getsockname: uintptr : 32
 SYS_fcntl      : uintptr : 92
 SYS_fsync      : uintptr : 95
@@ -202,24 +203,36 @@ accept_nil :: proc "contextless" (s: Fd) -> (Fd, Errno) {

 accept :: proc { accept_T, accept_nil }

+getsockname_or_peername :: proc "contextless" (s: Fd, sockaddr: ^$T, is_peer: bool) -> Errno {
+	// `sockaddr` must be a valid pointer: the syscall is told that `size_of(T)`
+	// bytes are writable behind it, so a bad pointer will segfault.
+	addrlen: socklen_t = size_of(T)
+	number := SYS_getpeername if is_peer else SYS_getsockname
+
+	result, ok := intrinsics.syscall_bsd(
+		number,
+		cast(uintptr)s,
+		cast(uintptr)sockaddr,
+		cast(uintptr)&addrlen)
+
+	if ok {
+		return nil
+	}
+	return cast(Errno)result
+}
+
+// Get the name of the peer connected to socket `s`; the result is written to `sockaddr`.
+//
+// The getpeername() system call appeared in 4.2BSD.
+getpeername :: proc "contextless" (s: Fd, sockaddr: ^$T) -> Errno {
+	return getsockname_or_peername(s, sockaddr, true)
+}
+
 // Get socket name.
 //
 // The getsockname() system call appeared in 4.2BSD.
 getsockname :: proc "contextless" (s: Fd, sockaddr: ^$T) -> Errno {
-	// sockaddr must contain a valid pointer, or this will segfault because
-	// we're telling the syscall that there's memory available to write to.
-	addrlen: socklen_t = size_of(T)
-
-	result, ok := intrinsics.syscall_bsd(SYS_getsockname,
-		cast(uintptr)s,
-		cast(uintptr)sockaddr,
-		cast(uintptr)&addrlen)
-
-	if !ok {
-		return cast(Errno)result
-	}
-
-	return nil
+	return getsockname_or_peername(s, sockaddr, false)
 }

 // Synchronize changes to a file.
@@ -23,6 +23,7 @@ CPU_Feature :: enum u64 {
 	popcnt,    // Hamming weight instruction POPCNT.
 	rdrand,    // RDRAND instruction (on-chip random number generator)
 	rdseed,    // RDSEED instruction (on-chip random number generator)
+	sha,       // SHA Extensions (SHA-1, SHA-224, SHA-256)
 	sse2,      // Streaming SIMD extension 2 (always available on amd64)
 	sse3,      // Streaming SIMD extension 3
 	ssse3,     // Supplemental streaming SIMD extension 3
@@ -115,6 +116,7 @@ init_cpu_features :: proc "c" () {

 	_, ebx7, ecx7, edx7 := cpuid(7, 0)
 	try_set(&set, .bmi1, 3, ebx7)
+	try_set(&set, .sha, 29, ebx7)
 	if os_supports_avx {
 		try_set(&set, .avx2, 5, ebx7)
 	}
@@ -1,3 +1,4 @@
+#+build linux
 #+no-instrumentation
 package linux

@@ -1325,18 +1325,20 @@ function odinSetupDefaultImports(wasmMemoryInterface, consoleElement, memory) {
 		} else if (!line.includes("\n")) {
 			currentLine[isError] = currentLine[isError].concat(line);
 		} else {
-			let lines = line.split("\n");
+			let lines = line.trimEnd().split("\n");
 			let printLast = lines.length > 1 && line.endsWith("\n");
 			println(currentLine[isError].concat(lines[0]));
 			currentLine[isError] = "";
 			for (let i = 1; i < lines.length-1; i++) {
 				println(lines[i]);
 			}
-			let last = lines[lines.length-1];
-			if (printLast) {
-				println(last);
-			} else {
-				currentLine[isError] = last;
+			if (lines.length > 1) {
+				let last = lines[lines.length-1];
+				if (printLast) {
+					println(last);
+				} else {
+					currentLine[isError] = last;
+				}
 			}
 		}

@@ -26,12 +26,14 @@ import topological_sort "core:container/topological_sort"

 import crypto           "core:crypto"
 import aead             "core:crypto/aead"
+import aegis            "core:crypto/aegis"
 import aes              "core:crypto/aes"
 import blake2b          "core:crypto/blake2b"
 import blake2s          "core:crypto/blake2s"
 import chacha20         "core:crypto/chacha20"
 import chacha20poly1305 "core:crypto/chacha20poly1305"
 import crypto_hash      "core:crypto/hash"
+import deoxysii         "core:crypto/deoxysii"
 import ed25519          "core:crypto/ed25519"
 import hkdf             "core:crypto/hkdf"
 import hmac             "core:crypto/hmac"
@@ -48,6 +50,7 @@ import shake            "core:crypto/shake"
 import sm3              "core:crypto/sm3"
 import tuplehash        "core:crypto/tuplehash"
 import x25519           "core:crypto/x25519"
+import x448             "core:crypto/x448"

 import pe               "core:debug/pe"
 import trace            "core:debug/trace"
@@ -169,11 +172,13 @@ _ :: topological_sort
 _ :: crypto
 _ :: crypto_hash
 _ :: aead
+_ :: aegis
 _ :: aes
 _ :: blake2b
 _ :: blake2s
 _ :: chacha20
 _ :: chacha20poly1305
+_ :: deoxysii
 _ :: ed25519
 _ :: hmac
 _ :: hkdf
@@ -190,6 +195,7 @@ _ :: shake
 _ :: sm3
 _ :: tuplehash
 _ :: x25519
+_ :: x448
 _ :: pe
 _ :: trace
 _ :: dynlib
@@ -4,9 +4,9 @@ pkgs.mkShell {
  nativeBuildInputs = with pkgs; [
    git
    which
-    clang_17
-    llvmPackages_17.llvm
-    llvmPackages_17.bintools
+    clang_20
+    llvmPackages_20.llvm
+    llvmPackages_20.bintools
  ];
  shellHook="CXX=clang++";
 }
@@ -171,6 +171,7 @@ struct TargetMetrics {
 enum Subtarget : u32 {
 	Subtarget_Default,
 	Subtarget_iOS,
+	Subtarget_Android,

 	Subtarget_COUNT,
 };
@@ -178,6 +179,7 @@ enum Subtarget : u32 {
 gb_global String subtarget_strings[Subtarget_COUNT] = {
 	str_lit(""),
 	str_lit("ios"),
+	str_lit("android"),
 };


@@ -204,20 +206,25 @@ enum BuildModeKind {
 	BuildMode_COUNT,
 };

-enum CommandKind : u32 {
+enum CommandKind : u64 {
 	Command_run             = 1<<0,
 	Command_build           = 1<<1,
-	Command_check           = 1<<3,
-	Command_doc             = 1<<5,
-	Command_version         = 1<<6,
-	Command_test            = 1<<7,
+	Command_check           = 1<<2,
+	Command_doc             = 1<<3,
+	Command_version         = 1<<4,
+	Command_test            = 1<<5,
 	
-	Command_strip_semicolon = 1<<8,
-	Command_bug_report      = 1<<9,
+	Command_strip_semicolon = 1<<6,
+	Command_bug_report      = 1<<7,
+
+	Command_bundle_android = 1<<8,
+	Command_bundle_macos   = 1<<9,
+	Command_bundle_ios     = 1<<10,
+	Command_bundle_orca    = 1<<11,

 	Command__does_check = Command_run|Command_build|Command_check|Command_doc|Command_test|Command_strip_semicolon,
 	Command__does_build = Command_run|Command_build|Command_test,
-	Command_all = ~(u32)0,
+	Command_all = ~(CommandKind)0,
 };

 gb_global char const *odin_command_strings[32] = {
@@ -228,6 +235,11 @@ gb_global char const *odin_command_strings[32] = {
 	"version",
 	"test",
 	"strip-semicolon",
+	"",
+	"bundle android",
+	"bundle macos",
+	"bundle ios",
+	"bundle orca",
 };


@@ -527,6 +539,22 @@ struct BuildContext {

 	String minimum_os_version_string;
 	bool   minimum_os_version_string_given;
+
+
+	int    ODIN_ANDROID_API_LEVEL;
+
+	String ODIN_ANDROID_SDK;
+
+	String ODIN_ANDROID_NDK;
+	String ODIN_ANDROID_NDK_TOOLCHAIN;
+	String ODIN_ANDROID_NDK_TOOLCHAIN_LIB;
+	String ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL;
+	String ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT;
+
+	String ODIN_ANDROID_JAR_SIGNER;
+	String android_keystore;
+	String android_keystore_alias;
+	String android_manifest;
 };

 gb_global BuildContext build_context = {0};
@@ -946,6 +974,14 @@ gb_internal bool is_arch_x86(void) {
 gb_global String const WIN32_SEPARATOR_STRING = {cast(u8 *)"\\", 1};
 gb_global String const NIX_SEPARATOR_STRING   = {cast(u8 *)"/",  1};

+// Path separator chosen at compile time: WIN32_SEPARATOR_STRING when
+// GB_SYSTEM_WINDOWS is defined, NIX_SEPARATOR_STRING otherwise.
+gb_global String const SEPARATOR_STRING =
+#if defined(GB_SYSTEM_WINDOWS)
+	WIN32_SEPARATOR_STRING;
+#else
+	NIX_SEPARATOR_STRING;
+#endif
+
+
 gb_global String const WASM_MODULE_NAME_SEPARATOR = str_lit("..");

 gb_internal String internal_odin_root_dir(void);
@@ -1461,6 +1497,103 @@ gb_internal bool has_ansi_terminal_colours(void) {
 	return build_context.has_ansi_terminal_colours && !json_errors();
 }

+// Fills in the Android-related fields of `build_context` from
+// `-minimum-os-version` and the ODIN_ANDROID_* environment variables.
+//
+// with_sdk: when true, the SDK path, jar signer, keystore and keystore alias
+//           are required and the NDK checks are skipped; when false, the NDK
+//           and its toolchain are required instead.
+//
+// Prints an error and exits with status 1 when a required value is missing.
+gb_internal void init_android_values(bool with_sdk) {
+	auto *bc = &build_context;
+	{ // Android SDK/API Level
+		String default_level = str_lit("34");
+		if (!bc->minimum_os_version_string_given) {
+			bc->minimum_os_version_string = default_level;
+		}
+		BigInt level = {};
+		bool success = false;
+		big_int_from_string(&level, bc->minimum_os_version_string, &success);
+		if (!success) {
+			// Not a number at all: warn and re-parse the default instead.
+			gb_printf_err("Warning: Invalid -minimum-os-version:%.*s for -subtarget:Android, defaulting to %.*s\n", LIT(bc->minimum_os_version_string), LIT(default_level));
+			bc->minimum_os_version_string = default_level;
+			big_int_from_string(&level, bc->minimum_os_version_string, &success);
+			GB_ASSERT(success);
+		}
+
+		i64 new_level = big_int_to_i64(&level);
+
+		// Levels below 21 are rejected and replaced by the default level.
+		if (new_level >= 21) {
+			bc->ODIN_ANDROID_API_LEVEL = cast(int)new_level;
+		} else {
+			gb_printf_err("Warning: Invalid -minimum-os-version:%.*s for -subtarget:Android, defaulting to %.*s\n", LIT(bc->minimum_os_version_string), LIT(default_level));
+			bc->ODIN_ANDROID_API_LEVEL = atoi(cast(char const *)default_level.text);
+		}
+	}
+	bc->ODIN_ANDROID_NDK           = normalize_path(permanent_allocator(), make_string_c(gb_get_env("ODIN_ANDROID_NDK", permanent_allocator())), NIX_SEPARATOR_STRING);
+	bc->ODIN_ANDROID_NDK_TOOLCHAIN = normalize_path(permanent_allocator(), make_string_c(gb_get_env("ODIN_ANDROID_NDK_TOOLCHAIN", permanent_allocator())), NIX_SEPARATOR_STRING);
+	bc->ODIN_ANDROID_SDK           = normalize_path(permanent_allocator(), make_string_c(gb_get_env("ODIN_ANDROID_SDK", permanent_allocator())), NIX_SEPARATOR_STRING);
+
+	#if defined(GB_SYSTEM_WINDOWS)
+		// Fall back to a default SDK location when the variable is unset.
+		if (bc->ODIN_ANDROID_SDK.len == 0) {
+			bc->ODIN_ANDROID_SDK = normalize_path(permanent_allocator(),
+				path_to_fullpath(permanent_allocator(), str_lit("%LocalAppData%/Android/Sdk"), nullptr),
+				NIX_SEPARATOR_STRING);
+		}
+	#endif
+
+	// Derive the toolchain directory from the NDK root when only the NDK was given.
+	if (bc->ODIN_ANDROID_NDK.len != 0 && bc->ODIN_ANDROID_NDK_TOOLCHAIN.len == 0) {
+		String arch = str_lit("x86_64");
+		#if defined (GB_CPU_ARM)
+			// TODO(bill): this is a complete guess
+			arch = str_lit("aarch64");
+		#endif
+		#if defined(GB_SYSTEM_WINDOWS)
+			bc->ODIN_ANDROID_NDK_TOOLCHAIN = concatenate4_strings(temporary_allocator(), bc->ODIN_ANDROID_NDK, str_lit("toolchains/llvm/prebuilt/"), str_lit("windows-"), arch);
+		#elif defined(GB_SYSTEM_OSX)
+			// TODO(bill): is this name even correct?
+			bc->ODIN_ANDROID_NDK_TOOLCHAIN = concatenate4_strings(temporary_allocator(), bc->ODIN_ANDROID_NDK, str_lit("toolchains/llvm/prebuilt/"), str_lit("darwin-"), arch);
+		#elif defined(GB_SYSTEM_LINUX)
+			bc->ODIN_ANDROID_NDK_TOOLCHAIN = concatenate4_strings(temporary_allocator(), bc->ODIN_ANDROID_NDK, str_lit("toolchains/llvm/prebuilt/"), str_lit("linux-"), arch);
+		#endif
+
+		bc->ODIN_ANDROID_NDK_TOOLCHAIN = normalize_path(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, NIX_SEPARATOR_STRING);
+	}
+
+	if (bc->ODIN_ANDROID_NDK.len == 0 && !with_sdk)  {
+		gb_printf_err("Error: ODIN_ANDROID_NDK not set\n");
+		gb_exit(1);
+
+	}
+
+	if (bc->ODIN_ANDROID_NDK_TOOLCHAIN.len == 0 && !with_sdk)  {
+		gb_printf_err("Error: ODIN_ANDROID_NDK_TOOLCHAIN not set\n");
+		gb_exit(1);
+	}
+
+	bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/usr/lib/aarch64-linux-android/"));
+
+	// "<toolchain lib>/<api level>/"
+	char buf[32] = {};
+	gb_snprintf(buf, gb_size_of(buf), "%d/", bc->ODIN_ANDROID_API_LEVEL);
+	bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN_LIB, make_string_c(buf));
+
+	bc->ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT = concatenate_strings(permanent_allocator(), bc->ODIN_ANDROID_NDK_TOOLCHAIN, str_lit("sysroot/"));
+
+
+	bc->ODIN_ANDROID_JAR_SIGNER = normalize_path(permanent_allocator(), make_string_c(gb_get_env("ODIN_ANDROID_JAR_SIGNER", permanent_allocator())), NIX_SEPARATOR_STRING);
+	if (with_sdk) {
+		if (bc->ODIN_ANDROID_SDK.len == 0)  {
+			gb_printf_err("Error: ODIN_ANDROID_SDK not set, which is required for -build-mode:executable for -subtarget:android\n");
+			gb_exit(1);
+		}
+		if (bc->ODIN_ANDROID_JAR_SIGNER.len == 0) {
+			gb_printf_err("Error: ODIN_ANDROID_JAR_SIGNER not set, which is required for -build-mode:executable for -subtarget:android\n");
+			gb_exit(1);
+		}
+		if (bc->android_keystore.len == 0) {
+			gb_printf_err("Error: -android-keystore:<string> has not been set\n");
+			gb_exit(1);
+		}
+		if (bc->android_keystore_alias.len == 0) {
+			gb_printf_err("Error: -android-keystore_alias:<string> has not been set\n");
+			gb_exit(1);
+		}
+	}
+}
+
 gb_internal bool has_asm_extension(String const &path) {
 	String ext = path_extension(path);
 	if (ext == ".asm") {
@@ -1652,6 +1785,15 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 		default:
 			GB_PANIC("Unknown architecture for darwin");
 		}
+	} else if (metrics->os == TargetOs_linux && subtarget == Subtarget_Android) {
+		switch (metrics->arch) {
+		case TargetArch_arm64:
+			bc->metrics.target_triplet = str_lit("aarch64-none-linux-android");
+			bc->reloc_mode = RelocMode_PIC;
+			break;
+		default:
+			GB_PANIC("Unknown architecture for -subtarget:android");
+		}
 	}

 	if (bc->metrics.os == TargetOs_windows) {
@@ -1706,6 +1848,8 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 		if (subtarget == Subtarget_Default) {
 			bc->metrics.target_triplet = concatenate_strings(permanent_allocator(), bc->metrics.target_triplet, bc->minimum_os_version_string);
 		}
+	} else if (selected_subtarget == Subtarget_Android) {
+		init_android_values(bc->build_mode == BuildMode_Executable);
 	}

 	if (!bc->custom_optimization_level) {
@@ -1749,6 +1893,30 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 	if (bc->metrics.os == TargetOs_freestanding) {
 		bc->ODIN_DEFAULT_TO_NIL_ALLOCATOR = !bc->ODIN_DEFAULT_TO_PANIC_ALLOCATOR;
 	}
+
+	if (subtarget == Subtarget_Android) {
+		switch (build_context.build_mode) {
+		case BuildMode_DynamicLibrary:
+		case BuildMode_Object:
+		case BuildMode_Assembly:
+		case BuildMode_LLVM_IR:
+			break;
+		default:
+		case BuildMode_Executable:
+		case BuildMode_StaticLibrary:
+			if ((build_context.command_kind & Command__does_build) != 0) {
+				gb_printf_err("Unsupported -build-mode for -subtarget:android\n");
+				gb_printf_err("\tCurrently only supporting: \n");
+				// gb_printf_err("\t\texe\n");
+				gb_printf_err("\t\tshared\n");
+				gb_printf_err("\t\tobject\n");
+				gb_printf_err("\t\tassembly\n");
+				gb_printf_err("\t\tllvm-ir\n");
+				gb_exit(1);
+			}
+			break;
+		}
+	}
 }

 #if defined(GB_SYSTEM_WINDOWS)
@@ -1947,7 +2115,10 @@ gb_internal bool init_build_paths(String init_filename) {
 		output_extension = make_string(nullptr, 0);
 		String const single_file_extension = str_lit(".odin");

-		if (build_context.metrics.os == TargetOs_windows) {
+		if (selected_subtarget == Subtarget_Android) {
+			// NOTE(bill): It's always shared!
+			output_extension = STR_LIT("so");
+		} else if (build_context.metrics.os == TargetOs_windows) {
 			output_extension = STR_LIT("exe");
 		} else if (build_context.cross_compiling && selected_target_metrics->metrics == &target_essence_amd64) {
 			// Do nothing: we don't want the .bin extension
@@ -0,0 +1,209 @@
+i32 bundle_android(String init_directory);
+
+// Entry point for the bundle commands: forwards to the bundler matching
+// `build_context.command_kind`. Returns that bundler's exit code, or 1
+// (after printing an error) when no bundler handles the command kind.
+i32 bundle(String init_directory) {
+	if (build_context.command_kind == Command_bundle_android) {
+		return bundle_android(init_directory);
+	}
+
+	gb_printf_err("Unknown odin package <platform>\n");
+	return 1;
+}
+
+
+// Builds an APK from `original_init_directory` by running, in order:
+//   1. `aapt package`  -> "<output>.apk-build"
+//   2. the jar signer  -> signs "<output>.apk-build" in place
+//   3. `zipalign`      -> "<output>.apk"
+//
+// The build-tools directory used is the one under "<SDK>/build-tools/" whose
+// leading version number is the smallest that is >= the configured API level.
+//
+// Returns 0 on success, 1 on a setup error, or the non-zero exit code of the
+// first tool that fails.
+i32 bundle_android(String original_init_directory) {
+	TEMPORARY_ALLOCATOR_GUARD();
+
+	i32 result = 0;
+	init_android_values(/*with_sdk*/true);
+
+	bool init_directory_ok = false;
+	String init_directory = path_to_fullpath(temporary_allocator(), original_init_directory, &init_directory_ok);
+	if (!init_directory_ok) {
+		gb_printf_err("Error: '%.*s' is not a valid directory\n", LIT(original_init_directory));
+		return 1;
+	}
+	init_directory = normalize_path(temporary_allocator(), init_directory, NIX_SEPARATOR_STRING);
+
+	int const ODIN_ANDROID_API_LEVEL = build_context.ODIN_ANDROID_API_LEVEL;
+
+	String android_sdk_build_tools = concatenate3_strings(temporary_allocator(),
+		build_context.ODIN_ANDROID_SDK, str_lit("build-tools"), NIX_SEPARATOR_STRING);
+
+	Array<FileInfo> list = {};
+	ReadDirectoryError rd_err = read_directory(android_sdk_build_tools, &list);
+	defer (array_free(&list));
+
+	switch (rd_err) {
+	case ReadDirectory_InvalidPath:
+		gb_printf_err("Invalid path: %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	case ReadDirectory_NotExists:
+		gb_printf_err("Path does not exist: %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	case ReadDirectory_Permission:
+		gb_printf_err("Permission denied whilst reading path %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	case ReadDirectory_NotDir:
+		gb_printf_err("Expected a directory for a package, got a file: %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	case ReadDirectory_Empty:
+		gb_printf_err("Empty directory: %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	case ReadDirectory_Unknown:
+		gb_printf_err("Unknown error whilst reading path %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	}
+
+	auto possible_valid_dirs = array_make<FileInfo>(heap_allocator(), 0, list.count);
+	defer (array_free(&possible_valid_dirs));
+
+
+	// Keep directories whose name starts with a digit and contains only digits
+	// up to the first '.' (e.g. "34.0.0").
+	for (FileInfo fi : list) if (fi.is_dir) {
+		bool all_numbers = true;
+		for (isize i = 0; i < fi.name.len; i++) {
+			u8 c = fi.name[i];
+			if ('0' <= c && c <= '9') {
+				// true
+			} else if (i == 0) {
+				all_numbers = false;
+			} else if (c == '.') {
+				break;
+			} else {
+				all_numbers = false;
+			}
+		}
+
+		if (all_numbers) {
+			array_add(&possible_valid_dirs, fi);
+		}
+	}
+
+	if (possible_valid_dirs.count == 0) {
+		gb_printf_err("Unable to find any Android SDK/API Level in %.*s\n", LIT(android_sdk_build_tools));
+		return 1;
+	}
+
+	int *dir_numbers = gb_alloc_array(temporary_allocator(), int, possible_valid_dirs.count);
+
+	// Parse the leading integer of every candidate directory name.
+	char buf[1024] = {};
+	for_array(i, possible_valid_dirs) {
+		FileInfo fi = possible_valid_dirs[i];
+		isize n = gb_min(gb_size_of(buf)-1, fi.name.len);
+		memcpy(buf, fi.name.text, n);
+		buf[n] = 0;
+
+		dir_numbers[i] = atoi(buf);
+	}
+
+	// Pick the smallest number that still satisfies the minimum API level.
+	isize closest_number_idx = -1;
+	for (isize i = 0; i < possible_valid_dirs.count; i++) {
+		if (dir_numbers[i] >= ODIN_ANDROID_API_LEVEL) {
+			if (closest_number_idx < 0) {
+				closest_number_idx = i;
+			} else if (dir_numbers[i] < dir_numbers[closest_number_idx]) {
+				closest_number_idx = i;
+			}
+		}
+	}
+
+	if (closest_number_idx < 0) {
+		gb_printf_err("Unable to find any Android SDK/API Level in %.*s meeting the minimum API level of %d\n", LIT(android_sdk_build_tools), ODIN_ANDROID_API_LEVEL);
+		return 1;
+	}
+
+	String api_number = possible_valid_dirs[closest_number_idx].name;
+
+	android_sdk_build_tools = concatenate_strings(temporary_allocator(), android_sdk_build_tools, api_number);
+	String android_sdk_platforms = concatenate_strings(temporary_allocator(),
+		build_context.ODIN_ANDROID_SDK,
+		make_string_c(gb_bprintf("platforms/android-%d/", dir_numbers[closest_number_idx]))
+	);
+
+	android_sdk_build_tools = normalize_path(temporary_allocator(), android_sdk_build_tools, NIX_SEPARATOR_STRING);
+	android_sdk_platforms   = normalize_path(temporary_allocator(), android_sdk_platforms,   NIX_SEPARATOR_STRING);
+
+	gbString cmd = gb_string_make(heap_allocator(), "");
+	defer (gb_string_free(cmd));
+
+
+	// Run the tools from inside the project directory and restore the original
+	// working directory on every exit path.
+	String current_directory = normalize_path(temporary_allocator(), get_working_directory(temporary_allocator()), NIX_SEPARATOR_STRING);
+	defer (set_working_directory(current_directory));
+
+	if (current_directory.len != 0) {
+		bool ok = set_working_directory(init_directory);
+		if (!ok) {
+			// The tools below write relative paths, so continuing from the
+			// wrong directory would bundle the wrong place.
+			gb_printf_err("Error: Unable to correctly set the current working directory to '%.*s'\n", LIT(init_directory));
+			return 1;
+		}
+	}
+
+	// NOTE(review): the output name is hard-coded to "test" here.
+	String output_filename = str_lit("test");
+	String output_apk = path_remove_extension(output_filename);
+
+	TIME_SECTION("Android aapt");
+	{
+		TEMPORARY_ALLOCATOR_GUARD();
+		gb_string_clear(cmd);
+
+		// A user-supplied manifest is joined onto the original working
+		// directory; otherwise "AndroidManifest.xml" in the project is used.
+		String manifest = {};
+		if (build_context.android_manifest.len != 0) {
+			manifest = concatenate_strings(temporary_allocator(), current_directory, build_context.android_manifest);
+		} else {
+			manifest = concatenate_strings(temporary_allocator(), init_directory, str_lit("AndroidManifest.xml"));
+		}
+
+		cmd = gb_string_append_length(cmd, android_sdk_build_tools.text, android_sdk_build_tools.len);
+		cmd = gb_string_appendc(cmd, "aapt");
+		cmd = gb_string_appendc(cmd, " package -f");
+		if (manifest.len != 0) {
+			cmd = gb_string_append_fmt(cmd, " -M \"%.*s\"", LIT(manifest));
+		}
+		cmd = gb_string_append_fmt(cmd, " -I \"%.*sandroid.jar\"", LIT(android_sdk_platforms));
+		cmd = gb_string_append_fmt(cmd, " -F \"%.*s.apk-build\"", LIT(output_apk));
+
+		result = system_exec_command_line_app("android-aapt", cmd);
+		if (result) {
+			return result;
+		}
+	}
+
+	TIME_SECTION("Android jarsigner");
+	{
+		TEMPORARY_ALLOCATOR_GUARD();
+		gb_string_clear(cmd);
+
+		cmd = gb_string_append_length(cmd, build_context.ODIN_ANDROID_JAR_SIGNER.text, build_context.ODIN_ANDROID_JAR_SIGNER.len);
+		// NOTE(review): the store password is hard-coded to "android".
+		cmd = gb_string_append_fmt(cmd, " -storepass android");
+		if (build_context.android_keystore.len != 0) {
+			String keystore = concatenate_strings(temporary_allocator(), current_directory, build_context.android_keystore);
+			cmd = gb_string_append_fmt(cmd, " -keystore \"%.*s\"", LIT(keystore));
+		}
+		cmd = gb_string_append_fmt(cmd, " \"%.*s.apk-build\"", LIT(output_apk));
+		if (build_context.android_keystore_alias.len != 0) {
+			String keystore_alias = build_context.android_keystore_alias;
+			cmd = gb_string_append_fmt(cmd, " \"%.*s\"", LIT(keystore_alias));
+		}
+
+		result = system_exec_command_line_app("android-jarsigner", cmd);
+		if (result) {
+			return result;
+		}
+	}
+
+	TIME_SECTION("Android zipalign");
+	{
+		TEMPORARY_ALLOCATOR_GUARD();
+		gb_string_clear(cmd);
+
+		cmd = gb_string_append_length(cmd, android_sdk_build_tools.text, android_sdk_build_tools.len);
+		cmd = gb_string_appendc(cmd, "zipalign");
+		cmd = gb_string_appendc(cmd, " -f 4");
+		cmd = gb_string_append_fmt(cmd, " \"%.*s.apk-build\" \"%.*s.apk\"", LIT(output_apk), LIT(output_apk));
+
+		result = system_exec_command_line_app("android-zipalign", cmd);
+		if (result) {
+			return result;
+		}
+	}
+
+	return 0;
+}
@@ -645,6 +645,13 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 				break;
 			}

+			if (!are_types_identical(x.type, y.type)) {
+				gbString tx = type_to_string(x.type);
+				gbString ty = type_to_string(y.type);
+				error(call, "Mismatched types to '%.*s', '%s' vs '%s'", LIT(builtin_name), tx, ty);
+				gb_string_free(ty);
+				gb_string_free(tx);
+			}

 			Type *vt = base_type(x.type);
 			GB_ASSERT(vt->kind == Type_SimdVector);
@@ -1675,12 +1682,16 @@ gb_internal bool check_builtin_procedure_directive(CheckerContext *c, Operand *o
 		}
 		if (ce->args.count > 0) {
 			Ast *arg = ce->args[0];
-			Operand o = {};
-			Entity *e = check_ident(c, &o, arg, nullptr, nullptr, true);
-			if (e == nullptr || (e->flags & EntityFlag_Param) == 0) {
-				error(ce->args[0], "'#caller_expression' expected a valid earlier parameter name");
+			if (arg->kind != Ast_Ident) {
+				error(arg, "'#caller_expression' expected an identifier");
+			} else {
+				Operand o = {};
+				Entity *e = check_ident(c, &o, arg, nullptr, nullptr, true);
+				if (e == nullptr || (e->flags & EntityFlag_Param) == 0) {
+					error(arg, "'#caller_expression' expected a valid earlier parameter name");
+				}
+				arg->Ident.entity = e;
 			}
-			arg->Ident.entity = e;
 		}

 		operand->type = t_string;
@@ -628,6 +628,10 @@ gb_internal void check_const_decl(CheckerContext *ctx, Entity *e, Ast *type_expr
 				Operand x = {};
 				x.type = entity->type;
 				x.mode = Addressing_Variable;
+				if (entity->kind == Entity_Constant) {
+					x.mode  = Addressing_Constant;
+					x.value = entity->Constant.value;
+				}
 				if (!check_is_assignable_to(ctx, &x, e->type)) {
 					gbString expr_str = expr_to_string(init);
 					gbString op_type_str = type_to_string(entity->type);
@@ -8979,8 +8979,14 @@ gb_internal ExprKind check_or_else_expr(CheckerContext *c, Operand *o, Ast *node
 		o->expr = node;
 		return Expr_Expr;
 	}
+
+	Type *left_type = nullptr;
+	Type *right_type = nullptr;
+	check_or_else_split_types(c, &x, name, &left_type, &right_type);
+	add_type_and_value(c, arg, x.mode, x.type, x.value);
+
 	bool y_is_diverging = false;
-	check_expr_base(c, &y, default_value, x.type);
+	check_expr_base(c, &y, default_value, left_type);
 	switch (y.mode) {
 	case Addressing_NoValue:
 		if (is_diverging_expr(y.expr)) {
@@ -9005,11 +9011,6 @@ gb_internal ExprKind check_or_else_expr(CheckerContext *c, Operand *o, Ast *node
 		return Expr_Expr;
 	}

-	Type *left_type = nullptr;
-	Type *right_type = nullptr;
-	check_or_else_split_types(c, &x, name, &left_type, &right_type);
-	add_type_and_value(c, arg, x.mode, x.type, x.value);
-
 	if (left_type != nullptr) {
 		if (!y_is_diverging) {
 			check_assignment(c, &y, left_type, name);
@@ -1149,6 +1149,7 @@ gb_internal void init_universal(void) {
 		GlobalEnumValue values[Subtarget_COUNT] = {
 			{"Default", Subtarget_Default},
 			{"iOS",     Subtarget_iOS},
+			{"Android", Subtarget_Android},
 		};

 		auto fields = add_global_enum_type(str_lit("Odin_Platform_Subtarget_Type"), values, gb_count_of(values));
@@ -7,19 +7,15 @@ struct LinkerData {
 	Array<String> output_temp_paths;
 	String   output_base;
 	String   output_name;
-#if defined(GB_SYSTEM_OSX)
-	b8       needs_system_library_linked;
-#endif
+	bool     needs_system_library_linked;
 };

 gb_internal i32 system_exec_command_line_app(char const *name, char const *fmt, ...);
 gb_internal bool system_exec_command_line_app_output(char const *command, gbString *output);

-#if defined(GB_SYSTEM_OSX)
 gb_internal void linker_enable_system_library_linking(LinkerData *ld) {
-	ld->needs_system_library_linked = 1;
+	ld->needs_system_library_linked = true;
 }
-#endif

 gb_internal void linker_data_init(LinkerData *ld, CheckerInfo *info, String const &init_fullpath) {
 	gbAllocator ha = heap_allocator();
@@ -28,9 +24,7 @@ gb_internal void linker_data_init(LinkerData *ld, CheckerInfo *info, String cons
 	array_init(&ld->foreign_libraries,   ha, 0, 1024);
 	ptr_set_init(&ld->foreign_libraries_set, 1024);

-#if defined(GB_SYSTEM_OSX)
-	ld->needs_system_library_linked = 0;
-#endif 
+	ld->needs_system_library_linked = false;

 	if (build_context.out_filepath.len == 0) {
 		ld->output_name = remove_directory_from_path(init_fullpath);
@@ -136,6 +130,9 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 		return result;
 	}

+	bool is_cross_linking = false;
+	bool is_android = false;
+
 	if (build_context.cross_compiling && selected_target_metrics->metrics == &target_essence_amd64) {
 #if defined(GB_SYSTEM_UNIX)
 		result = system_exec_command_line_app("linker", "x86_64-essence-gcc \"%.*s.o\" -o \"%.*s\" %.*s %.*s",
@@ -147,22 +144,29 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 		);
 #endif
 	} else if (build_context.cross_compiling && build_context.different_os) {
-		gb_printf_err("Linking for cross compilation for this platform is not yet supported (%.*s %.*s)\n",
-			LIT(target_os_names[build_context.metrics.os]),
-			LIT(target_arch_names[build_context.metrics.arch])
-		);
-		build_context.keep_object_files = true;
+		switch (selected_subtarget) {
+		case Subtarget_Android:
+			is_cross_linking = true;
+			is_android = true;
+			goto try_cross_linking;
+		default:
+			gb_printf_err("Linking for cross compilation for this platform is not yet supported (%.*s %.*s)\n",
+				LIT(target_os_names[build_context.metrics.os]),
+				LIT(target_arch_names[build_context.metrics.arch])
+			);
+			build_context.keep_object_files = true;
+			break;
+		}
 	} else {
+try_cross_linking:;
+
 	#if defined(GB_SYSTEM_WINDOWS)
-		bool is_windows = true;
+		bool is_windows = build_context.metrics.os == TargetOs_windows;
 	#else
 		bool is_windows = false;
 	#endif
-	#if defined(GB_SYSTEM_OSX)
-		bool is_osx = true;
-	#else
-		bool is_osx = false;
-	#endif
+
+		bool is_osx = build_context.metrics.os == TargetOs_darwin;


 		if (is_windows) {
@@ -414,23 +418,27 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 		} else {
 			timings_start_section(timings, str_lit("ld-link"));

+			int const ODIN_ANDROID_API_LEVEL = build_context.ODIN_ANDROID_API_LEVEL;
+
+			String ODIN_ANDROID_NDK                     = build_context.ODIN_ANDROID_NDK;
+			String ODIN_ANDROID_NDK_TOOLCHAIN           = build_context.ODIN_ANDROID_NDK_TOOLCHAIN;
+			String ODIN_ANDROID_NDK_TOOLCHAIN_LIB       = build_context.ODIN_ANDROID_NDK_TOOLCHAIN_LIB;
+			String ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL = build_context.ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL;
+			String ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT   = build_context.ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT;
+
 			// Link using `clang`, unless overridden by `ODIN_CLANG_PATH` environment variable.
 			const char* clang_path = gb_get_env("ODIN_CLANG_PATH", permanent_allocator());
 			if (clang_path == NULL) {
 				clang_path = "clang";
 			}

-			// NOTE(vassvik): get cwd, for used for local shared libs linking, since those have to be relative to the exe
-			char cwd[256];
-			#if !defined(GB_SYSTEM_WINDOWS)
-			getcwd(&cwd[0], 256);
-			#endif
-			//printf("%s\n", cwd);
-
 			// NOTE(vassvik): needs to add the root to the library search paths, so that the full filenames of the library
 			//                files can be passed with -l:
-			gbString lib_str = gb_string_make(heap_allocator(), "-L/");
+			gbString lib_str = gb_string_make(heap_allocator(), "");
 			defer (gb_string_free(lib_str));
+			#if !defined(GB_SYSTEM_WINDOWS)
+				lib_str = gb_string_appendc(lib_str, "-L/ ");
+			#endif
 			
 			StringSet asm_files = {};
 			string_set_init(&asm_files, 64);
@@ -496,19 +504,20 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 						}

 						String obj_format;
-					#if defined(GB_ARCH_64_BIT)
-						if (is_osx) {
-							obj_format = str_lit("macho64");
+						if (build_context.metrics.ptr_size == 8) {
+							if (is_osx) {
+								obj_format = str_lit("macho64");
+							} else {
+								obj_format = str_lit("elf64");
+							}
 						} else {
-							obj_format = str_lit("elf64");
+							GB_ASSERT(build_context.metrics.ptr_size == 4);
+							if (is_osx) {
+								obj_format = str_lit("macho32");
+							} else {
+								obj_format = str_lit("elf32");
+							}
 						}
-					#elif defined(GB_ARCH_32_BIT)
-						if (is_osx) {
-							obj_format = str_lit("macho32");
-						} else {
-							obj_format = str_lit("elf32");
-						}
-					#endif // GB_ARCH_*_BIT

 						if (build_context.metrics.arch == TargetArch_riscv64) {
 							result = system_exec_command_line_app("clang",
@@ -618,6 +627,78 @@ gb_internal i32 linker_stage(LinkerData *gen) {

 			gbString object_files = gb_string_make(heap_allocator(), "");
 			defer (gb_string_free(object_files));
+
+
+			if (is_android) { // NOTE(bill): glue code needed for Android
+				TIME_SECTION("Android Native App Glue Compile");
+
+				String android_glue_object = {};
+				String android_glue_static_lib = {};
+
+				char hash_buf[64] = {};
+				gb_snprintf(hash_buf, gb_size_of(hash_buf), "%p", &hash_buf);
+				String hash = make_string_c(hash_buf);
+
+				String temp_dir = normalize_path(temporary_allocator(), temporary_directory(temporary_allocator()), NIX_SEPARATOR_STRING);
+				android_glue_object = concatenate4_strings(temporary_allocator(), temp_dir, str_lit("android_native_app_glue-"), hash, str_lit(".o"));
+				android_glue_static_lib = concatenate4_strings(permanent_allocator(), temp_dir, str_lit("libandroid_native_app_glue-"), hash, str_lit(".a"));
+
+				gbString glue = gb_string_make(heap_allocator(), clang_path);
+				defer (gb_string_free(glue));
+
+				glue = gb_string_append_fmt(glue, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
+				glue = gb_string_appendc(glue, "-c \"");
+				glue = gb_string_append_length(glue, ODIN_ANDROID_NDK.text, ODIN_ANDROID_NDK.len);
+				glue = gb_string_appendc(glue, "sources/android/native_app_glue/android_native_app_glue.c");
+				glue = gb_string_appendc(glue, "\" ");
+				glue = gb_string_appendc(glue, "-o \"");
+				glue = gb_string_append_length(glue, android_glue_object.text, android_glue_object.len);
+				glue = gb_string_appendc(glue, "\" ");
+
+				glue = gb_string_appendc(glue, "\"-I");
+				glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
+				glue = gb_string_appendc(glue, "sysroot/usr/include/");
+				glue = gb_string_appendc(glue, "\" ");
+
+				glue = gb_string_appendc(glue, "\"-I");
+				glue = gb_string_append_length(glue, ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
+				glue = gb_string_appendc(glue, "sysroot/usr/include/aarch64-linux-android/");
+				glue = gb_string_appendc(glue, "\" ");
+
+
+				glue = gb_string_appendc(glue, "-Wno-macro-redefined ");
+
+				result = system_exec_command_line_app("android-native-app-glue-compile", glue);
+				if (result) {
+					return result;
+				}
+
+				TIME_SECTION("Android Native App Glue ar");
+
+				gbString ar = gb_string_make_length(heap_allocator(), ODIN_ANDROID_NDK_TOOLCHAIN.text, ODIN_ANDROID_NDK_TOOLCHAIN.len);
+				defer (gb_string_free(ar));
+
+				ar = gb_string_appendc(ar, "bin/llvm-ar");
+
+				ar = gb_string_appendc(ar, " rcs ");
+
+				ar = gb_string_appendc(ar, "\"");
+				ar = gb_string_append_length(ar, android_glue_static_lib.text, android_glue_static_lib.len);
+				ar = gb_string_appendc(ar, "\" ");
+
+				ar = gb_string_appendc(ar, "\"");
+				ar = gb_string_append_length(ar, android_glue_object.text, android_glue_object.len);
+				ar = gb_string_appendc(ar, "\" ");
+
+				result = system_exec_command_line_app("android-native-app-glue-ar", ar);
+				if (result) {
+					return result;
+				}
+
+				object_files = gb_string_append_fmt(object_files, "\"%.*s\" ", LIT(android_glue_static_lib));
+			}
+
+
 			for (String object_path : gen->output_object_paths) {
 				object_files = gb_string_append_fmt(object_files, "\"%.*s\" ", LIT(object_path));
 			}
@@ -661,7 +742,9 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 					link_settings = gb_string_appendc(link_settings, "-Wl,-init,'_odin_entry_point' ");
 					link_settings = gb_string_appendc(link_settings, "-Wl,-fini,'_odin_exit_point' ");
 				}
-
+			} else if (is_android) {
+				// Android output is always linked as a shared object (-shared).
+				link_settings = gb_string_appendc(link_settings, "-shared ");
 			}

 			if (build_context.build_mode == BuildMode_Executable && build_context.reloc_mode == RelocMode_PIC) {
@@ -670,6 +753,7 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 				if (build_context.metrics.os != TargetOs_openbsd
 					&& build_context.metrics.os != TargetOs_haiku
 					&& build_context.metrics.arch != TargetArch_riscv64
+					&& !is_android
 				) {
 					// OpenBSD and Haiku default to PIE executable. do not pass -no-pie for it.
 					link_settings = gb_string_appendc(link_settings, "-no-pie ");
@@ -701,6 +785,29 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 					// This points the linker to where the entry point is
 					link_settings = gb_string_appendc(link_settings, "-e _main ");
 				}
+			} else if (build_context.metrics.os == TargetOs_openbsd) {
+				// OpenBSD ports install shared libraries in /usr/local/lib. Also, we must explicitly link libpthread.
+				platform_lib_str = gb_string_appendc(platform_lib_str, "-lpthread -Wl,-L/usr/local/lib ");
+				// Until the LLVM back-end can be adapted to emit endbr64 instructions on amd64, we
+				// need to pass -z nobtcfi in order to allow the resulting program to run under
+				// OpenBSD 7.4 and newer. Once support is added at compile time, this can be dropped.
+				platform_lib_str = gb_string_appendc(platform_lib_str, "-Wl,-z,nobtcfi ");
+			}
+
+			if (is_android) {
+				GB_ASSERT(ODIN_ANDROID_NDK_TOOLCHAIN_LIB.len != 0);
+				GB_ASSERT(ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL.len != 0);
+				GB_ASSERT(ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT.len != 0);
+
+				platform_lib_str = gb_string_appendc(platform_lib_str, "\"-L");
+				platform_lib_str = gb_string_append_length(platform_lib_str, ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL.text, ODIN_ANDROID_NDK_TOOLCHAIN_LIB_LEVEL.len);
+				platform_lib_str = gb_string_appendc(platform_lib_str, "\" ");
+
+				platform_lib_str = gb_string_appendc(platform_lib_str, "\"--sysroot=");
+				platform_lib_str = gb_string_append_length(platform_lib_str, ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT.text, ODIN_ANDROID_NDK_TOOLCHAIN_SYSROOT.len);
+				platform_lib_str = gb_string_appendc(platform_lib_str, "\" ");
+
+				link_settings = gb_string_appendc(link_settings, "-u ANativeActivity_onCreate ");
 			}

 			if (!build_context.no_rpath) {
@@ -709,24 +816,31 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 				if (build_context.metrics.os == TargetOs_darwin) {
 					link_settings = gb_string_appendc(link_settings, "-Wl,-rpath,@loader_path ");
 				} else {
-					link_settings = gb_string_appendc(link_settings, "-Wl,-rpath,\\$ORIGIN ");
+					if (is_android) {
+						// Android: deliberately add no -rpath flag.
+					} else {
+						link_settings = gb_string_appendc(link_settings, "-Wl,-rpath,\\$ORIGIN ");
+					}
 				}
 			}

 			if (!build_context.no_crt) {
-				platform_lib_str = gb_string_appendc(platform_lib_str, "-lm ");
+				lib_str = gb_string_appendc(lib_str, "-lm ");
 				if (build_context.metrics.os == TargetOs_darwin) {
 					// NOTE: adding this causes a warning about duplicate libraries, I think it is
 					// automatically assumed/added by clang when you don't do `-nostdlib`.
-					// platform_lib_str = gb_string_appendc(platform_lib_str, "-lSystem ");
+					// lib_str = gb_string_appendc(lib_str, "-lSystem ");
 				} else {
-					platform_lib_str = gb_string_appendc(platform_lib_str, "-lc ");
+					lib_str = gb_string_appendc(lib_str, "-lc ");
 				}
 			}

 			gbString link_command_line = gb_string_make(heap_allocator(), clang_path);
 			defer (gb_string_free(link_command_line));

+			if (is_android) {
+				link_command_line = gb_string_append_fmt(link_command_line, " --target=aarch64-linux-android%d ", ODIN_ANDROID_API_LEVEL);
+			}
 			link_command_line = gb_string_appendc(link_command_line, " -Wno-unused-command-line-argument ");
 			link_command_line = gb_string_appendc(link_command_line, object_files);
 			link_command_line = gb_string_append_fmt(link_command_line, " -o \"%.*s\" ", LIT(output_filename));
@@ -736,6 +850,11 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 			link_command_line = gb_string_append_fmt(link_command_line, " %.*s ", LIT(build_context.extra_linker_flags));
 			link_command_line = gb_string_append_fmt(link_command_line, " %s ", link_settings);

+
+			if (is_android) {
+				TIME_SECTION("Linking");
+			}
+
 			if (build_context.linker_choice == Linker_lld) {
 				link_command_line = gb_string_append_fmt(link_command_line, " -fuse-ld=lld");
 				result = system_exec_command_line_app("lld-link", link_command_line);
@@ -1126,30 +1126,51 @@ gb_internal lbProcedure *lb_create_objc_names(lbModule *main_module) {
 	return p;
 }

-gb_internal void lb_finalize_objc_names(lbProcedure *p) {
+gb_internal void lb_finalize_objc_names(lbGenerator *gen, lbProcedure *p) {
 	if (p == nullptr) {
 		return;
 	}
 	lbModule *m = p->module;
+	GB_ASSERT(m == &p->module->gen->default_module);

 	TEMPORARY_ALLOCATOR_GUARD();

+	StringSet handled = {};
+	string_set_init(&handled);
+	defer (string_set_destroy(&handled));
+
 	auto args = array_make<lbValue>(temporary_allocator(), 1);

 	LLVMSetLinkage(p->value, LLVMInternalLinkage);
 	lb_begin_procedure_body(p);
-	for (auto const &entry : m->objc_classes) {
-		String name = entry.key;
-		args[0] = lb_const_value(m, t_cstring, exact_value_string(name));
-		lbValue ptr = lb_emit_runtime_call(p, "objc_lookUpClass", args);
-		lb_addr_store(p, entry.value.local_module_addr, ptr);
+
+	auto register_thing = [&handled, &m, &args](lbProcedure *p, lbObjCGlobal const &g, char const *call) {
+		if (!string_set_update(&handled, g.name)) {
+			lbAddr addr = {};
+			lbValue *found = string_map_get(&m->members, g.global_name);
+			if (found) {
+				addr = lb_addr(*found);
+			} else {
+				lbValue v = {};
+				LLVMTypeRef t = lb_type(m, g.type);
+				v.value = LLVMAddGlobal(m->mod, t, g.global_name);
+				v.type = alloc_type_pointer(g.type);
+				addr = lb_addr(v);
+				LLVMSetInitializer(v.value, LLVMConstNull(t));
+			}
+
+			args[0] = lb_const_value(m, t_cstring, exact_value_string(g.name));
+			lbValue ptr = lb_emit_runtime_call(p, call, args);
+			lb_addr_store(p, addr, ptr);
+		}
+	};
+
+	for (lbObjCGlobal g = {}; mpsc_dequeue(&gen->objc_classes, &g); /**/) {
+		register_thing(p, g, "objc_lookUpClass");
 	}

-	for (auto const &entry : m->objc_selectors) {
-		String name = entry.key;
-		args[0] = lb_const_value(m, t_cstring, exact_value_string(name));
-		lbValue ptr = lb_emit_runtime_call(p, "sel_registerName", args);
-		lb_addr_store(p, entry.value.local_module_addr, ptr);
+	for (lbObjCGlobal g = {}; mpsc_dequeue(&gen->objc_selectors, &g); /**/) {
+		register_thing(p, g, "sel_registerName");
 	}

 	lb_end_procedure_body(p);
@@ -2637,7 +2658,7 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {

 	if (gen->objc_names) {
 		TIME_SECTION("Finalize objc names");
-		lb_finalize_objc_names(gen->objc_names);
+		lb_finalize_objc_names(gen, gen->objc_names);
 	}

 	if (build_context.ODIN_DEBUG) {
@@ -143,11 +143,6 @@ struct lbPadType {
 	LLVMTypeRef type;
 };

-struct lbObjcRef {
-	Entity * entity;
-	lbAddr local_module_addr;
-};
-
 struct lbModule {
 	LLVMModuleRef mod;
 	LLVMContextRef ctx;
@@ -198,8 +193,9 @@ struct lbModule {
 	RecursiveMutex debug_values_mutex;
 	PtrMap<void *, LLVMMetadataRef> debug_values; 

-	StringMap<lbObjcRef> objc_classes;
-	StringMap<lbObjcRef> objc_selectors;
+
+	StringMap<lbAddr> objc_classes;
+	StringMap<lbAddr> objc_selectors;

 	PtrMap<u64/*type hash*/, lbAddr> map_cell_info_map; // address of runtime.Map_Info
 	PtrMap<u64/*type hash*/, lbAddr> map_info_map;      // address of runtime.Map_Cell_Info
@@ -218,6 +214,13 @@ struct lbEntityCorrection {
 	char const *cname;
 };

+struct lbObjCGlobal { // One entry of lbGenerator::objc_classes / objc_selectors, drained by lb_finalize_objc_names.
+	lbModule *module;      // NOTE(review): not read by the code visible here; presumably the module that queued the entry - confirm.
+	gbString  global_name; // Key looked up in the module's `members`; also the name given to LLVMAddGlobal when no member is found.
+	String    name;        // String constant passed to the runtime call (objc_lookUpClass / sel_registerName).
+	Type *    type;        // Value type of the global; a pointer to it is the type of the address that is stored to.
+};
+
 struct lbGenerator : LinkerData {
 	CheckerInfo *info;

@@ -235,6 +238,8 @@ struct lbGenerator : LinkerData {
 	lbProcedure *objc_names;

 	MPSCQueue<lbEntityCorrection> entities_to_correct_linkage;
+	MPSCQueue<lbObjCGlobal> objc_selectors;
+	MPSCQueue<lbObjCGlobal> objc_classes;
 };


@@ -33,7 +33,7 @@ gb_internal bool lb_is_elem_const(Ast *elem, Type *elem_type) {

 gb_internal bool lb_is_const_nil(lbValue value) {
 	LLVMValueRef v = value.value;
-	if (LLVMIsConstant(v)) {
+	if (v != nullptr && LLVMIsConstant(v)) {
 		if (LLVMIsAConstantAggregateZero(v)) {
 			return true;
 		} else if (LLVMIsAConstantPointerNull(v)) {
@@ -1125,10 +1125,11 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							visited[index] = true;
 						} else {
 							if (!visited[index]) {
-								values[index]  = lb_const_value(m, f->type, {}, false).value;
+								values[index]  = lb_const_value(m, f->type, {}, /*allow_local*/false, is_rodata).value;
 								visited[index] = true;
 							}

+
 							unsigned idx_list_len = cast(unsigned)sel.index.count-1;
 							unsigned *idx_list = gb_alloc_array(temporary_allocator(), unsigned, idx_list_len);

@@ -1139,6 +1140,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 									i32 index = sel.index[j];
 									Type *cvt = base_type(cv_type);

+
 									if (cvt->kind == Type_Struct) {
 										if (cvt->Struct.is_raw_union) {
 											// sanity check which should have been caught by `lb_is_nested_possibly_constant`
@@ -1164,8 +1166,40 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 								}
 								if (is_constant) {
 									LLVMValueRef elem_value = lb_const_value(m, tav.type, tav.value, allow_local, is_rodata).value;
-									if (LLVMIsConstant(elem_value)) {
+									if (LLVMIsConstant(elem_value) && LLVMIsConstant(values[index])) {
 										values[index] = llvm_const_insert_value(m, values[index], elem_value, idx_list, idx_list_len);
+									} else if (is_local) {
+									#if 1
+										lbProcedure *p = m->curr_procedure;
+										GB_ASSERT(p != nullptr);
+										if (LLVMIsConstant(values[index])) {
+											lbAddr addr = lb_add_local_generated(p, f->type, false);
+											lb_addr_store(p, addr, lbValue{values[index], f->type});
+											values[index] = lb_addr_load(p, addr).value;
+										}
+
+										GB_ASSERT(LLVMIsALoadInst(values[index]));
+
+										LLVMValueRef ptr = LLVMGetOperand(values[index], 0);
+
+										LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, idx_list_len);
+										LLVMTypeRef lt_u32 = lb_type(m, t_u32);
+										for (unsigned i = 0; i < idx_list_len; i++) {
+											indices[i] = LLVMConstInt(lt_u32, idx_list[i], false);
+										}
+
+										ptr = LLVMBuildGEP2(p->builder, lb_type(m, f->type), ptr, indices, idx_list_len, "");
+										ptr = LLVMBuildPointerCast(p->builder, ptr, lb_type(m, alloc_type_pointer(tav.type)), "");
+
+										if (LLVMIsALoadInst(elem_value)) {
+											i64 sz = type_size_of(tav.type);
+											LLVMValueRef src = LLVMGetOperand(elem_value, 0);
+											lb_mem_copy_non_overlapping(p, {ptr, t_rawptr}, {src, t_rawptr}, lb_const_int(m, t_int, sz), false);
+										} else {
+											LLVMBuildStore(p->builder, elem_value, ptr);
+										}
+									#endif
+										is_constant = false;
 									} else {
 										is_constant = false;
 									}
@@ -1205,7 +1239,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 				LLVMValueRef val = values[i];
 				if (!LLVMIsConstant(val)) {
 					GB_ASSERT(is_local);
-					GB_ASSERT(LLVMGetInstructionOpcode(val) == LLVMLoad);
+					GB_ASSERT(LLVMIsALoadInst(val));
 					is_constant = false;
 				}
 			}
@@ -1237,7 +1271,15 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					LLVMValueRef val = old_values[i];
 					if (!LLVMIsConstant(val)) {
 						LLVMValueRef dst = LLVMBuildStructGEP2(p->builder, llvm_addr_type(p->module, v.addr), v.addr.value, cast(unsigned)i, "");
+						// if (LLVMIsALoadInst(val)) {
+						// 	Type *ptr_type = v.addr.type;
+						// 	i64 sz = type_size_of(type_deref(ptr_type));
+
+						// 	LLVMValueRef src = LLVMGetOperand(val, 0);
+						// 	lb_mem_copy_non_overlapping(p, {dst, ptr_type}, {src, ptr_type}, lb_const_int(m, t_int, sz), false);
+						// } else {
 						LLVMBuildStore(p->builder, val, dst);
+						// }
 					}
 				}
 				return lb_addr_load(p, v);
@@ -1089,7 +1089,7 @@ gb_internal void lb_add_debug_local_variable(lbProcedure *p, LLVMValueRef ptr, T
 #if LLVM_VERSION_MAJOR <= 18
 	LLVMDIBuilderInsertDeclareAtEnd(m->debug_builder, storage, var_info, llvm_expr, llvm_debug_loc, block);
 #else
-	LLVMDIBuilderInsertDbgValueRecordAtEnd(m->debug_builder, storage, var_info, llvm_expr, llvm_debug_loc, block);
+	LLVMDIBuilderInsertDeclareRecordAtEnd(m->debug_builder, storage, var_info, llvm_expr, llvm_debug_loc, block);
 #endif
 }

@@ -3493,7 +3493,8 @@ gb_internal lbValue lb_build_expr_internal(lbProcedure *p, Ast *expr) {

 	if (tv.value.kind != ExactValue_Invalid) {
 		// NOTE(bill): Short on constant values
-		return lb_const_value(p->module, type, tv.value);
+		bool allow_local = true;
+		return lb_const_value(p->module, type, tv.value, allow_local);
 	} else if (tv.mode == Addressing_Type) {
 		// NOTE(bill, 2023-01-16): is this correct? I hope so at least
 		return lb_typeid(m, tv.type);
--- a/Show More
+++ b/Show More