Merge remote-tracking branch 'offical/master'

2026-07-25 08:57:55 +00:00 · 2024-09-09 13:15:00 -04:00
parent 975f7ba295 d783bca297
commit 10861d53c0
492 changed files with 44673 additions and 6618 deletions
@@ -18,7 +18,7 @@ jobs:
        usesh: true
        copyback: false
        prepare: |
-          PKG_PATH="https://cdn.NetBSD.org/pub/pkgsrc/packages/NetBSD/$(uname -p)/10.0_2024Q2/All" /usr/sbin/pkg_add pkgin
+          PKG_PATH="https://cdn.NetBSD.org/pub/pkgsrc/packages/NetBSD/amd64/$(uname -r | cut -d_ -f1)_${PKGSRC_BRANCH}/All" /usr/sbin/pkg_add pkgin
          pkgin -y in gmake git bash python311 llvm clang
          ln -s /usr/pkg/bin/python3.11 /usr/bin/python3
        run: |
@@ -32,10 +32,9 @@ jobs:
          gmake -C vendor/miniaudio/src
          ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_amd64
          ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_arm64
-          ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/core/speed.odin -file -all-packages -o:speed -define:ODIN_TEST_FANCY=false
-          ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
          (cd tests/issues; ./run.sh)
  build_freebsd:
    name: FreeBSD Build, Check, and Test
@@ -61,10 +60,9 @@ jobs:
          gmake -C vendor/cgltf/src
          gmake -C vendor/miniaudio/src
          ./odin check examples/all -vet -strict-style -disallow-do -target:freebsd_amd64
-          ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/core/speed.odin -file -all-packages -o:speed -define:ODIN_TEST_FANCY=false
-          ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
          (cd tests/issues; ./run.sh)
  ci:
    strategy:
@@ -118,15 +116,13 @@ jobs:
      - name: Odin check examples/all
        run: ./odin check examples/all -strict-style
      - name: Normal Core library tests
-        run: ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Optimized Core library tests
-        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Vendor library tests
-        run: ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Internals tests
-        run: ./odin test tests/internal -all-packages -define:ODIN_TEST_FANCY=false
-      - name: Core library benchmarks
-        run: ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: GitHub Issue tests
        run: |
          cd tests/issues
@@ -180,38 +176,33 @@ jobs:
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin run examples/demo -debug
+          odin run examples/demo -debug -vet -strict-style -disallow-do
      - name: Odin check examples/all
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin check examples/all -strict-style
+          odin check examples/all -vet -strict-style -disallow-do
      - name: Core library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Optimized core library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/core/speed.odin -o:speed -file -all-packages -define:ODIN_TEST_FANCY=false
-      - name: Core library benchmarks
-        shell: cmd
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Vendor library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
          copy vendor\lua\5.4\windows\*.dll .
-          odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Odin internals tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/internal -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Odin documentation tests
        shell: cmd
        run: |
@@ -229,3 +220,53 @@ jobs:
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
          odin check examples/all -strict-style -target:windows_i386
+
+  build_linux_riscv64:
+    runs-on: ubuntu-latest
+    name: Linux riscv64 (emulated) Build, Check and Test
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download LLVM (Linux)
+        run: |
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 18
+          echo "/usr/lib/llvm-18/bin" >> $GITHUB_PATH
+
+      - name: Build Odin
+        run: ./build_odin.sh release
+
+      - name: Odin version
+        run: ./odin version
+
+      - name: Odin report
+        run: ./odin report
+
+      - name: Compile needed Vendor
+        run: |
+          make -C vendor/stb/src
+          make -C vendor/cgltf/src
+          make -C vendor/miniaudio/src
+
+      - name: Odin check
+        run: ./odin check examples/all -target:linux_riscv64 -vet -strict-style -disallow-do
+
+      - name: Install riscv64 toolchain and qemu
+        run: sudo apt-get install -y qemu-user qemu-user-static gcc-12-riscv64-linux-gnu libc6-riscv64-cross
+
+      - name: Odin run
+        run: ./odin run examples/demo -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Odin run -debug
+        run: ./odin run examples/demo -debug -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Normal Core library tests
+        run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Optimized Core library tests
+        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Internals tests
+        run: ./odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
@@ -61,7 +61,6 @@ jobs:
          mkdir dist
          cp odin dist
          cp LICENSE dist
-          cp libLLVM* dist
          cp -r shared dist
          cp -r base dist
          cp -r core dist
@@ -17,13 +17,12 @@
 [Rr]eleases/
 x64/
 x86/
+!/core/simd/x86
 bld/
 [Bb]in/
 [Oo]bj/
 [Ll]og/
 ![Cc]ore/[Ll]og/
-tests/documentation/verify/
-tests/documentation/all.odin-doc
 # Visual Studio 2015 cache/options directory
 .vs/
 # Visual Studio Code options directory
@@ -31,7 +30,6 @@ tests/documentation/all.odin-doc
 # Uncomment if you have tasks that create the project's static files in wwwroot
 #wwwroot/
 demo
-benchmark

 # MSTest test Results
 [Tt]est[Rr]esult*/
@@ -1,4 +1,4 @@
-all: debug
+all: default

 demo:
 	./odin run examples/demo/demo.odin -file
@@ -6,12 +6,18 @@ demo:
 report:
 	./odin report

+default:
+	PROGRAM=make ./build_odin.sh # debug
+
 debug:
 	./build_odin.sh debug

 release:
 	./build_odin.sh release

+release-native:
+	./build_odin.sh release-native
+
 release_native:
 	./build_odin.sh release-native

@@ -76,9 +76,9 @@ Answers to common questions about Odin.

 Documentation for all the official packages that are part of the [core](https://pkg.odin-lang.org/core/) and [vendor](https://pkg.odin-lang.org/vendor/) library collections.

-#### [The Odin Wiki](https://github.com/odin-lang/Odin/wiki)
+#### [Odin Documentation](https://odin-lang.org/docs/)

-A wiki maintained by the Odin community.
+Documentation for the Odin language itself.

 #### [Odin Discord](https://discord.gg/sVBPHEv)

@@ -42,8 +42,8 @@ overflow_add :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #option
 overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
 overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---

-add_sat :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
-sub_sat :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
+saturating_add :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
+saturating_sub :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---

 sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---

@@ -219,14 +219,21 @@ type_map_cell_info :: proc($T: typeid)           -> ^runtime.Map_Cell_Info ---
 type_convert_variants_to_pointers :: proc($T: typeid) -> typeid where type_is_union(T) ---
 type_merge :: proc($U, $V: typeid) -> typeid where type_is_union(U), type_is_union(V) ---

+type_has_shared_fields :: proc($U, $V: typeid) -> bool where type_is_struct(U), type_is_struct(V) ---
+
 constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---

+constant_log2 :: proc($v: $T) -> T where type_is_integer(T) ---
+
 // SIMD related
 simd_add  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_mul  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_div  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_float(T) ---

+simd_saturating_add  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_integer(T) ---
+simd_saturating_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_integer(T) ---
+
 // Keeps Odin's Behaviour
 // (x << y) if y <= mask else 0
 simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
@@ -237,9 +244,6 @@ simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
 simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
 simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---

-simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
-simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
-
 simd_bit_and     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_bit_or      :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_bit_xor     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
@@ -268,13 +272,28 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---

-simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T ---
-simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T ---
-simd_reduce_min         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_max         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_and         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_or          :: proc(a: #simd[N]T) -> T ---
-simd_reduce_xor         :: proc(a: #simd[N]T) -> T ---
+simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_or          :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_xor         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+
+simd_reduce_any         :: proc(a: #simd[N]T) -> T where type_is_boolean(T) ---
+simd_reduce_all         :: proc(a: #simd[N]T) -> T where type_is_boolean(T) ---
+
+
+simd_gather       :: proc(ptr: #simd[N]rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_scatter      :: proc(ptr: #simd[N]rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+simd_masked_load  :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_masked_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+simd_masked_expand_load    :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_masked_compress_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+

 simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
 simd_select  :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
@@ -288,11 +307,11 @@ simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---

 simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---

-// equivalent a swizzle with descending indices, e.g. reserve(a, 3, 2, 1, 0)
-simd_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
+// equivalent to a swizzle with descending indices, e.g. swizzle(a, 3, 2, 1, 0)
+simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---

-simd_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
-simd_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---

 // Checks if the current target supports the given target features.
 //
@@ -546,10 +546,23 @@ Odin_OS_Type :: type_of(ODIN_OS)
 		arm64,
 		wasm32,
 		wasm64p32,
+		riscv64,
 	}
 */
 Odin_Arch_Type :: type_of(ODIN_ARCH)

+Odin_Arch_Types :: bit_set[Odin_Arch_Type]
+
+ALL_ODIN_ARCH_TYPES :: Odin_Arch_Types{
+	.amd64,
+	.i386,
+	.arm32,
+	.arm64,
+	.wasm32,
+	.wasm64p32,
+	.riscv64,
+}
+
 /*
 	// Defined internally by the compiler
 	Odin_Build_Mode_Type :: enum int {
@@ -573,6 +586,22 @@ Odin_Build_Mode_Type :: type_of(ODIN_BUILD_MODE)
 */
 Odin_Endian_Type :: type_of(ODIN_ENDIAN)

+Odin_OS_Types :: bit_set[Odin_OS_Type]
+
+ALL_ODIN_OS_TYPES :: Odin_OS_Types{
+	.Windows,
+	.Darwin,
+	.Linux,
+	.Essence,
+	.FreeBSD,
+	.OpenBSD,
+	.NetBSD,
+	.Haiku,
+	.WASI,
+	.JS,
+	.Orca,
+	.Freestanding,
+}

 /*
 	// Defined internally by the compiler
@@ -750,6 +779,10 @@ __init_context :: proc "contextless" (c: ^Context) {
 }

 default_assertion_failure_proc :: proc(prefix, message: string, loc: Source_Code_Location) -> ! {
+	default_assertion_contextless_failure_proc(prefix, message, loc)
+}
+
+default_assertion_contextless_failure_proc :: proc "contextless" (prefix, message: string, loc: Source_Code_Location) -> ! {
 	when ODIN_OS == .Freestanding {
 		// Do nothing
 	} else {
@@ -68,7 +68,7 @@ copy :: proc{copy_slice, copy_from_string}
 // Note: If you want the elements to remain in their order, use `ordered_remove`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-unordered_remove :: proc(array: ^$D/[dynamic]$T, index: int, loc := #caller_location) #no_bounds_check {
+unordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	n := len(array)-1
 	if index != n {
@@ -82,7 +82,7 @@ unordered_remove :: proc(array: ^$D/[dynamic]$T, index: int, loc := #caller_loca
 // Note: If the elements do not have to remain in their order, prefer `unordered_remove`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-ordered_remove :: proc(array: ^$D/[dynamic]$T, index: int, loc := #caller_location) #no_bounds_check {
+ordered_remove :: proc(array: ^$D/[dynamic]$T, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	if index+1 < len(array) {
 		copy(array[index:], array[index+1:])
@@ -95,7 +95,7 @@ ordered_remove :: proc(array: ^$D/[dynamic]$T, index: int, loc := #caller_locati
 // Note: This is an O(N) operation.
 // Note: If the range is out of bounds, this procedure will panic.
@builtin
-remove_range :: proc(array: ^$D/[dynamic]$T, lo, hi: int, loc := #caller_location) #no_bounds_check {
+remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #caller_location) #no_bounds_check {
 	slice_expr_error_lo_hi_loc(loc, lo, hi, len(array))
 	n := max(hi-lo, 0)
 	if n > 0 {
@@ -350,7 +350,7 @@ _make_dynamic_array_len_cap :: proc(array: ^Raw_Dynamic_Array, size_of_elem, ali
 	return
 }

-// `make_map` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
+// `make_map` allocates and initializes a map. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
@@ -362,7 +362,7 @@ make_map :: proc($T: typeid/map[$K]$E, #any_int capacity: int = 1<<MAP_MIN_LOG2_
 	err = reserve_map(&m, capacity, loc)
 	return
 }
-// `make_multi_pointer` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
+// `make_multi_pointer` allocates and initializes a multi-pointer. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // This is "similar" to doing `raw_data(make([]E, len, allocator))`.
@@ -602,7 +602,7 @@ append_nothing :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (n: i


@builtin
-inject_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+inject_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -620,7 +620,7 @@ inject_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast arg: E,
 }

@builtin
-inject_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+inject_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -643,7 +643,7 @@ inject_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args:
 }

@builtin
-inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -668,7 +668,7 @@ inject_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, index: int, arg: string


@builtin
-assign_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+assign_at_elem :: proc(array: ^$T/[dynamic]$E, #any_int index: int, arg: E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	if index < len(array) {
 		array[index] = arg
 		ok = true
@@ -682,7 +682,7 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #calle


@builtin
-assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+assign_at_elems :: proc(array: ^$T/[dynamic]$E, #any_int index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(args)
 	if len(args) == 0 {
 		ok = true
@@ -699,7 +699,7 @@ assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args:


@builtin
-assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+assign_at_elem_string :: proc(array: ^$T/[dynamic]$E/u8, #any_int index: int, arg: string, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(arg)
 	if len(arg) == 0 {
 		ok = true
@@ -838,7 +838,7 @@ non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: i

 	Note: Prefer the procedure group `shrink`
 */
-shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
+shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
 	return _shrink_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), new_cap, loc)
 }

@@ -948,3 +948,30 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
 	}
 	p("not yet implemented", message, loc)
 }
+
+
+@builtin
+@(disabled=ODIN_DISABLE_ASSERT)
+assert_contextless :: proc "contextless" (condition: bool, message := "", loc := #caller_location) {
+	if !condition {
+		// NOTE(bill): The failure path is wrapped in a separate
+		// procedure call so that the CPU does not execute it
+		// speculatively, which makes the assertion about an order
+		// of magnitude faster
+		@(cold)
+		internal :: proc "contextless" (message: string, loc: Source_Code_Location) {
+			default_assertion_contextless_failure_proc("runtime assertion", message, loc)
+		}
+		internal(message, loc)
+	}
+}
+
+@builtin
+panic_contextless :: proc "contextless" (message: string, loc := #caller_location) -> ! {
+	default_assertion_contextless_failure_proc("panic", message, loc)
+}
+
+@builtin
+unimplemented_contextless :: proc "contextless" (message := "", loc := #caller_location) -> ! {
+	default_assertion_contextless_failure_proc("not yet implemented", message, loc)
+}
@@ -76,7 +76,7 @@ raw_soa_footer :: proc{


@(builtin, require_results)
-make_soa_aligned :: proc($T: typeid/#soa[]$E, length: int, alignment: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+make_soa_aligned :: proc($T: typeid/#soa[]$E, #any_int length, alignment: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
 	if length <= 0 {
 		return
 	}
@@ -135,7 +135,7 @@ make_soa_aligned :: proc($T: typeid/#soa[]$E, length: int, alignment: int, alloc
 }

@(builtin, require_results)
-make_soa_slice :: proc($T: typeid/#soa[]$E, length: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+make_soa_slice :: proc($T: typeid/#soa[]$E, #any_int length: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
 	return make_soa_aligned(T, length, align_of(E), allocator, loc)
 }

@@ -172,7 +172,7 @@ make_soa :: proc{


@builtin
-resize_soa :: proc(array: ^$T/#soa[dynamic]$E, length: int, loc := #caller_location) -> Allocator_Error {
+resize_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
 	if array == nil {
 		return nil
 	}
@@ -183,7 +183,7 @@ resize_soa :: proc(array: ^$T/#soa[dynamic]$E, length: int, loc := #caller_locat
 }

@builtin
-non_zero_resize_soa :: proc(array: ^$T/#soa[dynamic]$E, length: int, loc := #caller_location) -> Allocator_Error {
+non_zero_resize_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
 	if array == nil {
 		return nil
 	}
@@ -194,12 +194,12 @@ non_zero_resize_soa :: proc(array: ^$T/#soa[dynamic]$E, length: int, loc := #cal
 }

@builtin
-reserve_soa :: proc(array: ^$T/#soa[dynamic]$E, capacity: int, loc := #caller_location) -> Allocator_Error {
+reserve_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
 	return _reserve_soa(array, capacity, true, loc)
 }

@builtin
-non_zero_reserve_soa :: proc(array: ^$T/#soa[dynamic]$E, capacity: int, loc := #caller_location) -> Allocator_Error {
+non_zero_reserve_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
 	return _reserve_soa(array, capacity, false, loc)
 }

@@ -484,7 +484,7 @@ into_dynamic_soa :: proc(array: $T/#soa[]$E) -> #soa[dynamic]E {
 // Note: If you want the elements to remain in their order, use `ordered_remove_soa`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-unordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, index: int, loc := #caller_location) #no_bounds_check {
+unordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	if index+1 < len(array) {
 		ti := type_info_of(typeid_of(T))
@@ -512,7 +512,7 @@ unordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, index: int, loc := #cal
 // Note: If the elements do not have to remain in their order, prefer `unordered_remove_soa`.
 // Note: If the index is out of bounds, this procedure will panic.
@builtin
-ordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, index: int, loc := #caller_location) #no_bounds_check {
+ordered_remove_soa :: proc(array: ^$T/#soa[dynamic]$E, #any_int index: int, loc := #caller_location) #no_bounds_check {
 	bounds_check_error_loc(loc, index, len(array))
 	if index+1 < len(array) {
 		ti := type_info_of(typeid_of(T))
@@ -1,8 +1,8 @@
 package runtime

 nil_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
-                               size, alignment: int,
-                               old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
+                           size, alignment: int,
+                           old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
 	switch mode {
 	case .Alloc, .Alloc_Non_Zeroed:
 		return nil, .Out_Of_Memory
@@ -129,7 +129,7 @@ arena_alloc :: proc(arena: ^Arena, size, alignment: uint, loc := #caller_locatio
 	return
 }

-// `arena_init` will initialize the arena with a usuable block.
+// `arena_init` will initialize the arena with a usable block.
 // Calling this procedure is not required in order to use the Arena: a zero-initialized Arena works as-is, because `arena_alloc` will set things up if necessary
@(require_results)
 arena_init :: proc(arena: ^Arena, size: uint, backing_allocator: Allocator, loc := #caller_location) -> Allocator_Error {
@@ -34,6 +34,9 @@ when ODIN_BUILD_MODE == .Dynamic {
 		} else when ODIN_OS == .Darwin && ODIN_ARCH == .arm64 {
 			@require foreign import entry "entry_unix_no_crt_darwin_arm64.asm"
 			SYS_exit :: 1
+		} else when ODIN_ARCH == .riscv64 {
+			@require foreign import entry "entry_unix_no_crt_riscv64.asm"
+			SYS_exit :: 93
 		}
 		@(link_name="_start_odin", linkage="strong", require)
 		_start_odin :: proc "c" (argc: i32, argv: [^]cstring) -> ! {
@@ -0,0 +1,10 @@
+.text
+
+.globl _start
+
+_start:
+	ld a0, 0(sp)		# first argument (argc): the word at the initial stack pointer
+	addi a1, sp, 8		# second argument (argv): address just past argc
+	andi sp, sp, ~15	# round sp down to a 16-byte boundary (mask, not an offset)
+	call _start_odin	# declared `-> !`, so it is not expected to return
+	ebreak			# trap if it ever does return
@@ -19,12 +19,15 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 	// the pointer we return to the user.
 	//

-	aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr = nil, zero_memory := true) -> ([]byte, Allocator_Error) {
+	aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) {
 		a := max(alignment, align_of(rawptr))
 		space := size + a - 1

 		allocated_mem: rawptr
-		if old_ptr != nil {
+
+		force_copy := old_ptr != nil && a > align_of(rawptr)
+
+		if !force_copy && old_ptr != nil {
 			original_old_ptr := ([^]rawptr)(old_ptr)[-1]
 			allocated_mem = heap_resize(original_old_ptr, space+size_of(rawptr))
 		} else {
@@ -36,12 +39,19 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 		aligned_ptr := (ptr - 1 + uintptr(a)) & -uintptr(a)
 		diff := int(aligned_ptr - ptr)
 		if (size + diff) > space || allocated_mem == nil {
+			aligned_free(old_ptr)
+			aligned_free(allocated_mem)
 			return nil, .Out_Of_Memory
 		}

 		aligned_mem = rawptr(aligned_ptr)
 		([^]rawptr)(aligned_mem)[-1] = allocated_mem

+		if force_copy {
+			mem_copy_non_overlapping(aligned_mem, old_ptr, min(old_size, size)) // the new block only holds `size` bytes; never copy past it when shrinking
+			aligned_free(old_ptr)
+		}
+
 		return byte_slice(aligned_mem, size), nil
 	}

@@ -53,10 +63,10 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,

 	aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) {
 		if p == nil {
-			return nil, nil
+			return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory)
 		}

-		new_memory = aligned_alloc(new_size, new_alignment, p, zero_memory) or_return
+		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return

 		// NOTE: heap_resize does not zero the new memory, so we do it
 		if zero_memory && new_size > old_size {
@@ -68,7 +78,7 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,

 	switch mode {
 	case .Alloc, .Alloc_Non_Zeroed:
-		return aligned_alloc(size, alignment, nil, mode == .Alloc)
+		return aligned_alloc(size, alignment, nil, 0, mode == .Alloc)

 	case .Free:
 		aligned_free(old_memory)
@@ -77,9 +87,6 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 		return nil, .Mode_Not_Implemented

 	case .Resize, .Resize_Non_Zeroed:
-		if old_memory == nil {
-			return aligned_alloc(size, alignment, nil, mode == .Resize)
-		}
 		return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize)

 	case .Query_Features:
@@ -8,10 +8,9 @@ IS_WASM :: ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32

@(private)
 RUNTIME_LINKAGE :: "strong" when (
-	(ODIN_USE_SEPARATE_MODULES || 
+	ODIN_USE_SEPARATE_MODULES || 
 	ODIN_BUILD_MODE == .Dynamic ||
-	!ODIN_NO_CRT) &&
-	!IS_WASM) else "internal"
+	!ODIN_NO_CRT) else "internal"
 RUNTIME_REQUIRE :: false // !ODIN_TILDE

@(private)
@@ -879,9 +878,6 @@ extendhfsf2 :: proc "c" (value: __float16) -> f32 {

@(link_name="__floattidf", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 floattidf :: proc "c" (a: i128) -> f64 {
-when IS_WASM {
-	return 0
-} else {
 	DBL_MANT_DIG :: 53
 	if a == 0 {
 		return 0.0
@@ -921,14 +917,10 @@ when IS_WASM {
 	fb[0] = u32(a)                           // mantissa-low
 	return transmute(f64)fb
 }
-}


@(link_name="__floattidf_unsigned", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 floattidf_unsigned :: proc "c" (a: u128) -> f64 {
-when IS_WASM {
-	return 0
-} else {
 	DBL_MANT_DIG :: 53
 	if a == 0 {
 		return 0.0
@@ -966,7 +958,6 @@ when IS_WASM {
 	fb[0] = u32(a)                           // mantissa-low
 	return transmute(f64)fb
 }
-}



@@ -1023,14 +1014,32 @@ modti3 :: proc "c" (a, b: i128) -> i128 {

@(link_name="__divmodti4", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 divmodti4 :: proc "c" (a, b: i128, rem: ^i128) -> i128 {
-	u := udivmod128(u128(a), u128(b), (^u128)(rem))
-	return i128(u)
+	s_a := a >> (128 - 1) // -1 if negative or 0
+	s_b := b >> (128 - 1)
+	an := (a ~ s_a) - s_a // absolute
+	bn := (b ~ s_b) - s_b
+
+	s_b   ~= s_a // quotient sign
+	u_s_b := u128(s_b)
+	u_s_a := u128(s_a)
+
+	r: u128 = ---
+	u := i128((udivmodti4(u128(an), u128(bn), &r) ~ u_s_b) - u_s_b) // negate if negative
+	rem^ = i128((r ~ u_s_a) - u_s_a)
+	return u
 }

@(link_name="__divti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 divti3 :: proc "c" (a, b: i128) -> i128 {
-	u := udivmodti4(u128(a), u128(b), nil)
-	return i128(u)
+	s_a := a >> (128 - 1) // -1 if negative or 0
+	s_b := b >> (128 - 1)
+	an := (a ~ s_a) - s_a // absolute
+	bn := (b ~ s_b) - s_b
+
+	s_a   ~= s_b // quotient sign
+	u_s_a := u128(s_a)
+
+	return i128((udivmodti4(u128(an), u128(bn), nil) ~ u_s_a) - u_s_a) // negate if negative
 }


@@ -12,6 +12,8 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 		SYS_write :: uintptr(4)
 	} else when ODIN_ARCH == .arm32 {
 		SYS_write :: uintptr(4)
+	} else when ODIN_ARCH == .riscv64 {
+		SYS_write :: uintptr(64)
 	}

 	stderr :: 2
@@ -52,3 +52,24 @@ udivti3 :: proc "c" (la, ha, lb, hb: u64) -> u128 {
 	b.lo, b.hi = lb, hb
 	return udivmodti4(a.all, b.all, nil)
 }
+
+@(link_name="__lshrti3", linkage="strong")
+__lshrti3 :: proc "c" (la, ha: u64, b: u32) -> i128 {
+	bits :: size_of(u64)*8 // width of one half of the 128-bit value (`la`/`ha` are u64), i.e. 64
+	// Shift the 128-bit value (ha:la) right by `b` bits, filling with zeros; `b` is expected in 0..<128.
+	input, result: ti_int
+	input.lo = la
+	input.hi = ha
+
+	if b & bits != 0 { // 64 <= b < 128: the low half is taken entirely from `hi`
+		result.hi = 0
+		result.lo = input.hi >> (b - bits)
+	} else if b == 0 { // nothing to shift
+		return input.all
+	} else { // 0 < b < 64: bits leaving `hi` enter the top of `lo`
+		result.hi = input.hi >> b
+		result.lo = (input.hi << (bits - b)) | (input.lo >> b)
+	}
+
+	return result.all
+}
@@ -0,0 +1,34 @@
+package runtime
+
+Thread_Local_Cleaner :: #type proc "odin" ()
+
+@(private="file")
+thread_local_cleaners: [8]Thread_Local_Cleaner
+
+// Add a procedure that will be run at the end of a thread for the purpose of
+// deallocating state marked as `thread_local`.
+//
+// Intended to be called in an `init` procedure of a package with
+// dynamically-allocated memory that is stored in `thread_local` variables.
+add_thread_local_cleaner :: proc "contextless" (p: Thread_Local_Cleaner) {
+	for &v in thread_local_cleaners {
+		if v == nil {
+			v = p
+			return
+		}
+	}
+	panic_contextless("There are no more thread-local cleaner slots available.")
+}
+
+// Run all of the thread-local cleaner procedures.
+//
+// Intended to be called by the internals of a threading API at the end of a
+// thread's lifetime.
+run_thread_local_cleaners :: proc "odin" () {
+	for p in thread_local_cleaners {
+		if p == nil {
+			break
+		}
+		p()
+	}
+}
@@ -116,6 +116,9 @@ if %errorlevel% neq 0 goto end_of_build
 rem If the demo doesn't run for you and your CPU is more than a decade old, try -microarch:native
 if %release_mode% EQU 0 odin run examples/demo -vet -strict-style -- Hellope World

+rem Many non-compiler devs seem to run debug build but don't realize.
+if %release_mode% EQU 0 echo: & echo Debug compiler built. Note: run "build.bat release" if you want a faster, release mode compiler.
+
 del *.obj > NUL 2> NUL

 :end_of_build
@@ -23,6 +23,14 @@ error() {
 	exit 1
 }

+# Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
+if [ -z "$LLVM_CONFIG" ] &&  [ -n "$(command -v brew)" ]; then
+    if   [ -n "$(command -v $(brew --prefix llvm@18)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@18)/bin/llvm-config"
+    elif [ -n "$(command -v $(brew --prefix llvm@17)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@17)/bin/llvm-config"
+    elif [ -n "$(command -v $(brew --prefix llvm@14)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@14)/bin/llvm-config"
+    fi
+fi
+
 if [ -z "$LLVM_CONFIG" ]; then
 	# darwin, linux, openbsd
 	if   [ -n "$(command -v llvm-config-18)" ]; then LLVM_CONFIG="llvm-config-18"
@@ -95,7 +103,7 @@ Linux)
 	LDFLAGS="$LDFLAGS -ldl $($LLVM_CONFIG --libs core native --system-libs --libfiles)"
 	# Copy libLLVM*.so into current directory for linking
 	# NOTE: This is needed by the Linux release pipeline!
-	cp $(readlink -f $($LLVM_CONFIG --libfiles)) ./
+	# cp $(readlink -f $($LLVM_CONFIG --libfiles)) ./
 	LDFLAGS="$LDFLAGS -Wl,-rpath=\$ORIGIN"
 	;;
 OpenBSD)
@@ -144,12 +152,17 @@ build_odin() {
 }

 run_demo() {
-	./odin run examples/demo -vet -strict-style -- Hellope World
+	if [ $# -eq 0 ] || [ "$1" = "debug" ]; then
+		./odin run examples/demo -vet -strict-style -- Hellope World
+	fi
 }

 if [ $# -eq 0 ]; then
 	build_odin debug
 	run_demo
+
+	: ${PROGRAM:=$0}
+	printf "\nDebug compiler built. Note: run \"$PROGRAM release\" or \"$PROGRAM release-native\" if you want a faster, release mode compiler.\n"
 elif [ $# -eq 1 ]; then
 	case $1 in
 	report)
@@ -144,6 +144,9 @@ buffer_grow :: proc(b: ^Buffer, n: int, loc := #caller_location) {
 }

 buffer_write_at :: proc(b: ^Buffer, p: []byte, offset: int, loc := #caller_location) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	b.last_read = .Invalid
 	if offset < 0 {
 		err = .Invalid_Offset
@@ -246,10 +249,13 @@ buffer_read_ptr :: proc(b: ^Buffer, ptr: rawptr, size: int) -> (n: int, err: io.
 }

 buffer_read_at :: proc(b: ^Buffer, p: []byte, offset: int) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	b.last_read = .Invalid

 	if uint(offset) >= len(b.buf) {
-		err = .Invalid_Offset
+		err = .EOF
 		return
 	}
 	n = copy(p, b.buf[offset:])
@@ -310,6 +316,27 @@ buffer_unread_rune :: proc(b: ^Buffer) -> io.Error {
 	return nil
 }

+buffer_seek :: proc(b: ^Buffer, offset: i64, whence: io.Seek_From) -> (i64, io.Error) {
+	abs: i64 // absolute target position, computed relative to `whence`
+	switch whence {
+	case .Start:
+		abs = offset
+	case .Current:
+		abs = i64(b.off) + offset
+	case .End:
+		abs = i64(len(b.buf)) + offset
+	case:
+		return 0, .Invalid_Whence
+	}
+	// Validate in 64 bits: where `int` is narrower than i64 the conversion could wrap a bad offset into range.
+	abs_int := int(abs)
+	if abs < 0 || i64(abs_int) != abs {
+		return 0, .Invalid_Offset
+	}
+	b.last_read = .Invalid
+	b.off = abs_int
+	return abs, nil
+}

 buffer_read_bytes :: proc(b: ^Buffer, delim: byte) -> (line: []byte, err: io.Error) {
 	i := index_byte(b.buf[b.off:], delim)
@@ -395,14 +422,17 @@ _buffer_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 		return io._i64_err(buffer_write(b, p))
 	case .Write_At:
 		return io._i64_err(buffer_write_at(b, p, int(offset)))
+	case .Seek:
+		n, err = buffer_seek(b, offset, whence)
+		return
 	case .Size:
-		n = i64(buffer_capacity(b))
+		n = i64(buffer_length(b))
 		return
 	case .Destroy:
 		buffer_destroy(b)
 		return
 	case .Query:
-		return io.query_utility({.Read, .Read_At, .Write, .Write_At, .Size, .Destroy})
+		return io.query_utility({.Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Destroy, .Query})
 	}
 	return 0, .Empty
 }
@@ -1,9 +1,38 @@
 package bytes

+import "base:intrinsics"
 import "core:mem"
+import "core:simd"
 import "core:unicode"
 import "core:unicode/utf8"

+when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+	@(private)
+	SCANNER_INDICES_256 : simd.u8x32 : {
+		0,  1,  2,  3,  4,  5,  6,  7,
+		8,  9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+	}
+	@(private)
+	SCANNER_SENTINEL_MAX_256: simd.u8x32 : u8(0x00)
+	@(private)
+	SCANNER_SENTINEL_MIN_256: simd.u8x32 : u8(0xff)
+	@(private)
+	SIMD_REG_SIZE_256 :: 32
+}
+@(private)
+SCANNER_INDICES_128 : simd.u8x16 : {
+	0,  1,  2,  3,  4,  5,  6,  7,
+	8,  9, 10, 11, 12, 13, 14, 15,
+}
+@(private)
+SCANNER_SENTINEL_MAX_128: simd.u8x16 : u8(0x00)
+@(private)
+SCANNER_SENTINEL_MIN_128: simd.u8x16 : u8(0xff)
+@(private)
+SIMD_REG_SIZE_128 :: 16
+
 clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte {
 	c := make([]byte, len(s), allocator, loc)
 	copy(c, s)
@@ -293,28 +322,277 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 	return _split_iterator(s, sep, len(sep))
 }

+/*
+Scan a slice of bytes for a specific byte.

-index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := 0; i < len(s); i += 1 {
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
+	i, l := 0, len(s)
+
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if l < SIMD_REG_SIZE_128 {
+		for /**/; i < l; i += 1 {
+			if s[i] == c {
+				return i
+			}
+		}
+		return -1
+	}
+
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
+
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
+
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+
+				i += 4 * SIMD_REG_SIZE_256
+			}
+
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for nr_blocks := (l - i) / (2 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+
+				i += 2 * SIMD_REG_SIZE_256
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_128); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j]= intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+
+				i += 4 * SIMD_REG_SIZE_128
+			}
+		}
+	}
+
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for nr_blocks := (l - i) / SIMD_REG_SIZE_128; nr_blocks > 0; nr_blocks -= 1 {
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 {
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128)
+			off := simd.reduce_min(sel)
+			return i + int(off)
+		}
+
+		i += SIMD_REG_SIZE_128
+	}
+
+	// Scan serially for the remainder.
+	for /**/; i < l; i += 1 {
 		if s[i] == c {
 			return i
 		}
 	}
+
 	return -1
 }

-// Returns -1 if c is not present
-last_index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := len(s)-1; i >= 0; i -= 1 {
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the byte `c`, or -1 if it was not found.
+*/
+last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
+	i := len(s)
+
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if i < SIMD_REG_SIZE_128 {
+		#reverse for ch, j in s {
+			if ch == c {
+				return j
+			}
+		}
+		return -1
+	}
+
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
+
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
+
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_256 {
+				i -= 4 * SIMD_REG_SIZE_256
+
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
+
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for i >= 2 * SIMD_REG_SIZE_256 {
+				i -= 2 * SIMD_REG_SIZE_256
+
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[1-j] > 0 {
+							sel := simd.select(c_vecs[1-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (1-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_128 {
+				i -= 4 * SIMD_REG_SIZE_128
+
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for i >= SIMD_REG_SIZE_128 {
+		i -= SIMD_REG_SIZE_128
+
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 {
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128)
+			off := simd.reduce_max(sel)
+			return i + int(off)
+		}
+	}
+
+	// Scan serially for the remainder.
+	for i > 0 {
+		i -= 1
 		if s[i] == c {
 			return i
 		}
 	}
+
 	return -1
 }


-
@private PRIME_RABIN_KARP :: 16777619

 index :: proc(s, substr: []byte) -> int {
@@ -9,10 +9,11 @@ Reader :: struct {
 	prev_rune: int,    // previous reading index of rune or < 0
 }

-reader_init :: proc(r: ^Reader, s: []byte) {
+reader_init :: proc(r: ^Reader, s: []byte) -> io.Stream {
 	r.s = s
 	r.i = 0
 	r.prev_rune = -1
+	return reader_to_stream(r)
 }

 reader_to_stream :: proc(r: ^Reader) -> (s: io.Stream) {
@@ -33,6 +34,9 @@ reader_size :: proc(r: ^Reader) -> i64 {
 }

 reader_read :: proc(r: ^Reader, p: []byte) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	if r.i >= i64(len(r.s)) {
 		return 0, .EOF
 	}
@@ -42,6 +46,9 @@ reader_read :: proc(r: ^Reader, p: []byte) -> (n: int, err: io.Error) {
 	return
 }
 reader_read_at :: proc(r: ^Reader, p: []byte, off: i64) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	if off < 0 {
 		return 0, .Invalid_Offset
 	}
@@ -97,7 +104,6 @@ reader_unread_rune :: proc(r: ^Reader) -> io.Error {
 	return nil
 }
 reader_seek :: proc(r: ^Reader, offset: i64, whence: io.Seek_From) -> (i64, io.Error) {
-	r.prev_rune = -1
 	abs: i64
 	switch whence {
 	case .Start:
@@ -114,6 +120,7 @@ reader_seek :: proc(r: ^Reader, offset: i64, whence: io.Seek_From) -> (i64, io.E
 		return 0, .Invalid_Offset
 	}
 	r.i = abs
+	r.prev_rune = -1
 	return abs, nil
 }
 reader_write_to :: proc(r: ^Reader, w: io.Writer) -> (n: i64, err: io.Error) {
@@ -47,8 +47,8 @@ foreign libc {
 	clogf   :: proc(z: complex_float) -> complex_float ---

 	// 7.3.8 Power and absolute-value functions
-	cabs    :: proc(z: complex_double) -> complex_double ---
-	cabsf   :: proc(z: complex_float) -> complex_float ---
+	cabs    :: proc(z: complex_double) -> double ---
+	cabsf   :: proc(z: complex_float) -> float ---
 	cpow    :: proc(x, y: complex_double) -> complex_double ---
 	cpowf   :: proc(x, y: complex_float) -> complex_float ---
 	csqrt   :: proc(z: complex_double) -> complex_double ---
@@ -102,6 +102,6 @@ when ODIN_OS == .Haiku {
 // read the value, or to produce an lvalue such that you can assign a different
 // error value to errno. To work around this, just expose it as a function like
 // it actually is.
-errno :: #force_inline proc() -> ^int {
+errno :: #force_inline proc "contextless" () -> ^int {
 	return _get_errno()
 }
@@ -32,24 +32,21 @@ when ODIN_OS == .Windows {
 		// the RDX register will contain zero and correctly set the flag to disable
 		// stack unwinding.
 		@(link_name="_setjmp")
-		setjmp  :: proc(env: ^jmp_buf, hack: rawptr = nil) -> int ---
+		setjmp :: proc(env: ^jmp_buf, hack: rawptr = nil) -> int ---
 	}
 } else {
 	@(default_calling_convention="c")
 	foreign libc {
 		// 7.13.1 Save calling environment
-		//
-		// NOTE(dweiler): C11 requires setjmp be a macro, which means it won't
-		// necessarily export a symbol named setjmp but rather _setjmp in the case
-		// of musl, glibc, BSD libc, and msvcrt.
-		@(link_name="_setjmp")
-		setjmp  :: proc(env: ^jmp_buf) -> int ---
+		@(link_name=LSETJMP)
+		setjmp :: proc(env: ^jmp_buf) -> int ---
 	}
 }

@(default_calling_convention="c")
 foreign libc {
 	// 7.13.2 Restore calling environment
+	@(link_name=LLONGJMP)
 	longjmp :: proc(env: ^jmp_buf, val: int) -> ! ---
 }

@@ -64,3 +61,11 @@ foreign libc {
 // The choice of 4096 bytes for storage of this type is more than enough on all
 // relevant platforms.
 jmp_buf :: struct #align(16) { _: [4096]char, }
+
+when ODIN_OS == .NetBSD {
+	@(private) LSETJMP  :: "__setjmp14"
+	@(private) LLONGJMP :: "__longjmp14"
+} else {
+	@(private) LSETJMP  :: "setjmp"
+	@(private) LLONGJMP :: "longjmp"
+}
@@ -17,6 +17,12 @@ when ODIN_OS == .Windows {

 FILE :: struct {}

+Whence :: enum int {
+	SET = SEEK_SET,
+	CUR = SEEK_CUR,
+	END = SEEK_END,
+}
+
 // MSVCRT compatible.
 when ODIN_OS == .Windows {
 	_IOFBF       :: 0x0000
@@ -101,6 +107,8 @@ when ODIN_OS == .OpenBSD || ODIN_OS == .NetBSD {
 	SEEK_CUR :: 1
 	SEEK_END :: 2

+	TMP_MAX :: 308915776
+
 	foreign libc {
 		__sF: [3]FILE
 	}
@@ -128,6 +136,8 @@ when ODIN_OS == .FreeBSD {
 	SEEK_CUR :: 1
 	SEEK_END :: 2

+	TMP_MAX :: 308915776
+
 	foreign libc {
 		@(link_name="__stderrp") stderr: ^FILE
 		@(link_name="__stdinp")  stdin:  ^FILE
@@ -195,10 +205,21 @@ when ODIN_OS == .Haiku {
 	}
 }

+when ODIN_OS == .NetBSD {
+	@(private) LRENAME  :: "__posix_rename"
+	@(private) LFGETPOS :: "__fgetpos50"
+	@(private) LFSETPOS :: "__fsetpos50"
+} else {
+	@(private) LRENAME  :: "rename"
+	@(private) LFGETPOS :: "fgetpos"
+	@(private) LFSETPOS :: "fsetpos"
+}
+
@(default_calling_convention="c")
 foreign libc {
 	// 7.21.4 Operations on files
 	remove    :: proc(filename: cstring) -> int ---
+	@(link_name=LRENAME)
 	rename    :: proc(old, new: cstring) -> int ---
 	tmpfile   :: proc() -> ^FILE ---
 	tmpnam    :: proc(s: [^]char) -> [^]char ---
@@ -240,8 +261,10 @@ foreign libc {
 	fwrite    :: proc(ptr: rawptr, size: size_t, nmemb: size_t, stream: ^FILE) -> size_t ---

 	// 7.21.9 File positioning functions
+	@(link_name=LFGETPOS)
 	fgetpos   :: proc(stream: ^FILE, pos: ^fpos_t) -> int ---
-	fseek     :: proc(stream: ^FILE, offset: long, whence: int) -> int ---
+	fseek     :: proc(stream: ^FILE, offset: long, whence: Whence) -> int ---
+	@(link_name=LFSETPOS)
 	fsetpos   :: proc(stream: ^FILE, pos: ^fpos_t) -> int ---
 	ftell     :: proc(stream: ^FILE) -> long ---
 	rewind    :: proc(stream: ^FILE) ---
@@ -288,11 +311,11 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 				return 0, unknown_or_eof(file)
 			}

-			if fseek(file, long(offset), SEEK_SET) != 0 {
+			if fseek(file, long(offset), .SET) != 0 {
 				return 0, unknown_or_eof(file)
 			}

-			defer fseek(file, long(curr), SEEK_SET)
+			defer fseek(file, long(curr), .SET)

 			n = i64(fread(raw_data(p), size_of(byte), len(p), file))
 			if n == 0 { err = unknown_or_eof(file) }
@@ -307,17 +330,21 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 				return 0, unknown_or_eof(file)
 			}

-			if fseek(file, long(offset), SEEK_SET) != 0 {
+			if fseek(file, long(offset), .SET) != 0 {
 				return 0, unknown_or_eof(file)
 			}

-			defer fseek(file, long(curr), SEEK_SET)
+			defer fseek(file, long(curr), .SET)

 			n = i64(fwrite(raw_data(p), size_of(byte), len(p), file))
 			if n == 0 { err = unknown_or_eof(file) }

 		case .Seek:
-			if fseek(file, long(offset), int(whence)) != 0 {
+			#assert(int(Whence.SET) == int(io.Seek_From.Start))
+			#assert(int(Whence.CUR) == int(io.Seek_From.Current))
+			#assert(int(Whence.END) == int(io.Seek_From.End))
+
+			if fseek(file, long(offset), Whence(whence)) != 0 {
 				return 0, unknown_or_eof(file)
 			}
 		
@@ -326,9 +353,9 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 			if curr == -1 {
 				return 0, unknown_or_eof(file)
 			}
-			defer fseek(file, curr, SEEK_SET)
+			defer fseek(file, curr, .SET)

-			if fseek(file, 0, SEEK_END) != 0 {
+			if fseek(file, 0, .END) != 0 {
 				return 0, unknown_or_eof(file)
 			}

@@ -341,7 +368,7 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 			return 0, .Empty
 		
 		case .Query:
-			return io.query_utility({ .Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size })
+			return io.query_utility({ .Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Query })
 		}
 		return
 	}
@@ -40,10 +40,9 @@ when ODIN_OS == .Linux {
 }


-when ODIN_OS == .Darwin {
+when ODIN_OS == .Darwin || ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD {
 	RAND_MAX :: 0x7fffffff

-	// GLIBC and MUSL only
 	@(private="file")
 	@(default_calling_convention="c")
 	foreign libc {
@@ -55,6 +54,20 @@ when ODIN_OS == .Darwin {
 	}
 }

+when ODIN_OS == .NetBSD {
+	RAND_MAX :: 0x7fffffff
+
+	@(private="file")
+	@(default_calling_convention="c")
+	foreign libc {
+		__mb_cur_max: size_t
+	}
+
+	MB_CUR_MAX :: #force_inline proc() -> size_t {
+		return __mb_cur_max
+	}
+}
+
 // C does not declare what these values should be, as an implementation is free
 // to use any two distinct values it wants to indicate success or failure.
 // However, nobody actually does and everyone appears to have agreed upon these
@@ -99,7 +112,7 @@ foreign libc {
 	at_quick_exit :: proc(func: proc "c" ()) -> int ---
 	exit          :: proc(status: int) -> ! ---
 	_Exit         :: proc(status: int) -> ! ---
-	getenv        :: proc(name: cstring) -> [^]char ---
+	getenv        :: proc(name: cstring) -> cstring ---
 	quick_exit    :: proc(status: int) -> ! ---
 	system        :: proc(cmd: cstring) -> int ---

@@ -150,4 +163,4 @@ aligned_free :: #force_inline proc "c" (ptr: rawptr) {
 	} else {
 		free(ptr)
 	}
-}
+}
@@ -40,7 +40,7 @@ foreign libc {
 	strtok   :: proc(s1: [^]char, s2: cstring) -> [^]char ---

 	// 7.24.6 Miscellaneous functions
-	strerror :: proc(errnum: int) -> [^]char ---
+	strerror :: proc(errnum: int) -> cstring ---
 	strlen   :: proc(s: cstring) -> size_t ---
 }
 memset :: proc "c" (s: rawptr, c: int, n: size_t) -> rawptr {
@@ -50,30 +50,56 @@ when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS =
 	foreign libc {
 		// 7.27.2 Time manipulation functions
 		clock        :: proc() -> clock_t ---
+		@(link_name=LDIFFTIME)
 		difftime     :: proc(time1, time2: time_t) -> double ---
+		@(link_name=LMKTIME)
 		mktime       :: proc(timeptr: ^tm) -> time_t ---
+		@(link_name=LTIME)
 		time         :: proc(timer: ^time_t) -> time_t ---
 		timespec_get :: proc(ts: ^timespec, base: int) -> int ---

 		// 7.27.3 Time conversion functions
 		asctime      :: proc(timeptr: ^tm) -> [^]char ---
+		@(link_name=LCTIME)
 		ctime        :: proc(timer: ^time_t) -> [^]char ---
+		@(link_name=LGMTIME)
 		gmtime       :: proc(timer: ^time_t) -> ^tm ---
+		@(link_name=LLOCALTIME)
 		localtime    :: proc(timer: ^time_t) -> ^tm ---
 		strftime     :: proc(s: [^]char, maxsize: size_t, format: cstring, timeptr: ^tm) -> size_t ---
 	}

+	when ODIN_OS == .NetBSD {
+		@(private) LDIFFTIME  :: "__difftime50"
+		@(private) LMKTIME    :: "__mktime50"
+		@(private) LTIME      :: "__time50"
+		@(private) LCTIME     :: "__ctime50"
+		@(private) LGMTIME    :: "__gmtime50"
+		@(private) LLOCALTIME :: "__localtime50"
+	} else {
+		@(private) LDIFFTIME  :: "difftime"
+		@(private) LMKTIME    :: "mktime"
+		@(private) LTIME      :: "time"
+		@(private) LCTIME     :: "ctime"
+		@(private) LGMTIME    :: "gmtime"
+		@(private) LLOCALTIME :: "localtime"
+	}
+
 	when ODIN_OS == .OpenBSD {
 		CLOCKS_PER_SEC :: 100
 	} else {
 		CLOCKS_PER_SEC :: 1000000
 	}

-	TIME_UTC       :: 1
+	TIME_UTC :: 1

-	time_t         :: distinct i64
+	time_t :: distinct i64

-	clock_t        :: long
+	when ODIN_OS == .FreeBSD || ODIN_OS == .NetBSD {
+		clock_t :: distinct int32_t
+	} else {
+		clock_t :: distinct long
+	}

 	timespec :: struct {
 		tv_sec:  time_t,
@@ -0,0 +1,90 @@
+/*
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+		Ginger Bill:     Cosmetic changes.
+
+	A small GZIP implementation as an example.
+*/
+
+/*
+Example:
+	import "core:bytes"
+	import "core:os"
+	import "core:compress"
+	import "core:fmt"
+
+	// Small GZIP file with fextra, fname and fcomment present.
+	@private
+	TEST: []u8 = {
+		0x1f, 0x8b, 0x08, 0x1c, 0xcb, 0x3b, 0x3a, 0x5a,
+		0x02, 0x03, 0x07, 0x00, 0x61, 0x62, 0x03, 0x00,
+		0x63, 0x64, 0x65, 0x66, 0x69, 0x6c, 0x65, 0x6e,
+		0x61, 0x6d, 0x65, 0x00, 0x54, 0x68, 0x69, 0x73,
+		0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f,
+		0x6d, 0x6d, 0x65, 0x6e, 0x74, 0x00, 0x2b, 0x48,
+		0xac, 0xcc, 0xc9, 0x4f, 0x4c, 0x01, 0x00, 0x15,
+		0x6a, 0x2c, 0x42, 0x07, 0x00, 0x00, 0x00,
+	}
+
+	main :: proc() {
+		// Set up output buffer.
+		buf := bytes.Buffer{}
+
+		stdout :: proc(s: string) {
+			os.write_string(os.stdout, s)
+		}
+		stderr :: proc(s: string) {
+			os.write_string(os.stderr, s)
+		}
+
+		args := os.args
+
+		if len(args) < 2 {
+			stderr("No input file specified.\n")
+			err := load(data=TEST, buf=&buf, known_gzip_size=len(TEST))
+			if err == nil {
+				stdout("Displaying test vector: ")
+				stdout(bytes.buffer_to_string(&buf))
+				stdout("\n")
+			} else {
+				fmt.printf("gzip.load returned %v\n", err)
+			}
+			bytes.buffer_destroy(&buf)
+			os.exit(0)
+		}
+
+		// The rest are all files.
+		args = args[1:]
+		err: Error
+
+		for file in args {
+			if file == "-" {
+				// Read from stdin
+				s := os.stream_from_handle(os.stdin)
+				ctx := &compress.Context_Stream_Input{
+					input = s,
+				}
+				err = load(ctx, &buf)
+			} else {
+				err = load(file, &buf)
+			}
+			if err != nil {
+				if err == E_General.File_Not_Found {
+					stderr("File not found: ")
+					stderr(file)
+					stderr("\n")
+					os.exit(1)
+				}
+				stderr("GZIP returned an error.\n")
+				bytes.buffer_destroy(&buf)
+				os.exit(2)
+			}
+			stdout(bytes.buffer_to_string(&buf))
+		}
+		bytes.buffer_destroy(&buf)
+	}
+*/
+package compress_gzip
@@ -1,89 +0,0 @@
-//+build ignore
-package compress_gzip
-
-/*
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
-
-	List of contributors:
-		Jeroen van Rijn: Initial implementation.
-		Ginger Bill:     Cosmetic changes.
-
-	A small GZIP implementation as an example.
-*/
-
-import "core:bytes"
-import "core:os"
-import "core:compress"
-import "core:fmt"
-
-// Small GZIP file with fextra, fname and fcomment present.
-@private
-TEST: []u8 = {
-	0x1f, 0x8b, 0x08, 0x1c, 0xcb, 0x3b, 0x3a, 0x5a,
-	0x02, 0x03, 0x07, 0x00, 0x61, 0x62, 0x03, 0x00,
-	0x63, 0x64, 0x65, 0x66, 0x69, 0x6c, 0x65, 0x6e,
-	0x61, 0x6d, 0x65, 0x00, 0x54, 0x68, 0x69, 0x73,
-	0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f,
-	0x6d, 0x6d, 0x65, 0x6e, 0x74, 0x00, 0x2b, 0x48,
-	0xac, 0xcc, 0xc9, 0x4f, 0x4c, 0x01, 0x00, 0x15,
-	0x6a, 0x2c, 0x42, 0x07, 0x00, 0x00, 0x00,
-}
-
-main :: proc() {
-	// Set up output buffer.
-	buf := bytes.Buffer{}
-
-	stdout :: proc(s: string) {
-		os.write_string(os.stdout, s)
-	}
-	stderr :: proc(s: string) {
-		os.write_string(os.stderr, s)
-	}
-
-	args := os.args
-
-	if len(args) < 2 {
-		stderr("No input file specified.\n")
-		err := load(data=TEST, buf=&buf, known_gzip_size=len(TEST))
-		if err == nil {
-			stdout("Displaying test vector: ")
-			stdout(bytes.buffer_to_string(&buf))
-			stdout("\n")
-		} else {
-			fmt.printf("gzip.load returned %v\n", err)
-		}
-		bytes.buffer_destroy(&buf)
-		os.exit(0)
-	}
-
-	// The rest are all files.
-	args = args[1:]
-	err: Error
-
-	for file in args {
-		if file == "-" {
-			// Read from stdin
-			s := os.stream_from_handle(os.stdin)
-			ctx := &compress.Context_Stream_Input{
-				input = s,
-			}
-			err = load(ctx, &buf)
-		} else {
-			err = load(file, &buf)
-		}
-		if err != nil {
-			if err != E_General.File_Not_Found {
-				stderr("File not found: ")
-				stderr(file)
-				stderr("\n")
-				os.exit(1)
-			}
-			stderr("GZIP returned an error.\n")
-				bytes.buffer_destroy(&buf)
-			os.exit(2)
-		}
-		stdout(bytes.buffer_to_string(&buf))
-	}
-	bytes.buffer_destroy(&buf)
-}
@@ -4,7 +4,6 @@
 	which is an English word model.
 */

-// package shoco is an implementation of the shoco short string compressor
 package compress_shoco

 DEFAULT_MODEL :: Shoco_Model {
@@ -145,4 +144,4 @@ DEFAULT_MODEL :: Shoco_Model {
 		{ 0xc0000000, 2, 4, { 25, 22, 19, 16, 16, 16, 16, 16 }, { 15,  7,  7,  7, 0, 0, 0, 0 }, 0xe0, 0xc0 },
 		{ 0xe0000000, 4, 8, { 23, 19, 15, 11,  8,  5,  2,  0 }, { 31, 15, 15, 15, 7, 7, 7, 3 }, 0xf0, 0xe0 },
 	},
-}
+}
@@ -8,7 +8,7 @@
 	An implementation of [shoco](https://github.com/Ed-von-Schleck/shoco) by Christian Schramm.
 */

-// package shoco is an implementation of the shoco short string compressor
+// package shoco is an implementation of the shoco short string compressor.
 package compress_shoco

 import "base:intrinsics"
@@ -308,4 +308,4 @@ compress_string :: proc(input: string, model := DEFAULT_MODEL, allocator := cont
 	resize(&buf, length) or_return
 	return buf[:length], result
 }
-compress :: proc{compress_string_to_buffer, compress_string}
+compress :: proc{compress_string_to_buffer, compress_string}
@@ -0,0 +1,50 @@
+/*
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+
+	An example of how to use `zlib.inflate`.
+*/
+
+/*
+Example:
+	package main
+
+	import "core:bytes"
+	import "core:fmt"
+
+	main :: proc() {
+		ODIN_DEMO := []u8{
+			120, 218, 101, 144,  65, 110, 131,  48,  16,  69, 215, 246,  41, 190,  44,  69,  73,  32, 148, 182,
+			 75,  75,  28,  32, 251,  46, 217,  88, 238,   0,  86, 192,  32, 219,  36, 170, 170, 172, 122, 137,
+			238, 122, 197,  30, 161,  70, 162,  20,  81, 203, 139,  25, 191, 255, 191,  60,  51,  40, 125,  81,
+			 53,  33, 144,  15, 156, 155, 110, 232,  93, 128, 208, 189,  35,  89, 117,  65, 112, 222,  41,  99,
+			 33,  37,   6, 215, 235, 195,  17, 239, 156, 197, 170, 118, 170, 131,  44,  32,  82, 164,  72, 240,
+			253, 245, 249, 129,  12, 185, 224,  76, 105,  61, 118,  99, 171,  66, 239,  38, 193,  35, 103,  85,
+			172,  66, 127,  33, 139,  24, 244, 235, 141,  49, 204, 223,  76, 208, 205, 204, 166,   7, 173,  60,
+			 97, 159, 238,  37, 214,  41, 105, 129, 167,   5, 102,  27, 152, 173,  97, 178, 129,  73, 129, 231,
+			  5, 230,  27, 152, 175, 225,  52, 192, 127, 243, 170, 157, 149,  18, 121, 142, 115, 109, 227, 122,
+			 64,  87, 114, 111, 161,  49, 182,   6, 181, 158, 162, 226, 206, 167,  27, 215, 246,  48,  56,  99,
+			 67, 117,  16,  47,  13,  45,  35, 151,  98, 231,  75,   1, 173,  90,  61, 101, 146,  71, 136, 244,
+			170, 218, 145, 176, 123,  45, 173,  56, 113, 134, 191,  51, 219,  78, 235,  95,  28, 249, 253,   7,
+			159, 150, 133, 125,
+		}
+		OUTPUT_SIZE :: 432
+
+		buf: bytes.Buffer
+
+		// We can pass ", true" to inflate a raw DEFLATE stream instead of a ZLIB wrapped one.
+		err := inflate(input=ODIN_DEMO, buf=&buf, expected_output_size=OUTPUT_SIZE)
+		defer bytes.buffer_destroy(&buf)
+
+		if err != nil {
+			fmt.printf("\nError: %v\n", err)
+		}
+		s := bytes.buffer_to_string(&buf)
+		fmt.printf("Input: %v bytes, output (%v bytes):\n%v\n", len(ODIN_DEMO), len(s), s)
+		assert(len(s) == OUTPUT_SIZE)
+	}
+*/
+package compress_zlib
@@ -1,47 +0,0 @@
-//+build ignore
-package compress_zlib
-
-/*
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
-	Made available under Odin's BSD-3 license.
-
-	List of contributors:
-		Jeroen van Rijn: Initial implementation.
-
-	An example of how to use `zlib.inflate`.
-*/
-
-import "core:bytes"
-import "core:fmt"
-
-main :: proc() {
-	ODIN_DEMO := []u8{
-		120, 218, 101, 144,  65, 110, 131,  48,  16,  69, 215, 246,  41, 190,  44,  69,  73,  32, 148, 182,
-		 75,  75,  28,  32, 251,  46, 217,  88, 238,   0,  86, 192,  32, 219,  36, 170, 170, 172, 122, 137,
-		238, 122, 197,  30, 161,  70, 162,  20,  81, 203, 139,  25, 191, 255, 191,  60,  51,  40, 125,  81,
-		 53,  33, 144,  15, 156, 155, 110, 232,  93, 128, 208, 189,  35,  89, 117,  65, 112, 222,  41,  99,
-		 33,  37,   6, 215, 235, 195,  17, 239, 156, 197, 170, 118, 170, 131,  44,  32,  82, 164,  72, 240,
-		253, 245, 249, 129,  12, 185, 224,  76, 105,  61, 118,  99, 171,  66, 239,  38, 193,  35, 103,  85,
-		172,  66, 127,  33, 139,  24, 244, 235, 141,  49, 204, 223,  76, 208, 205, 204, 166,   7, 173,  60,
-		 97, 159, 238,  37, 214,  41, 105, 129, 167,   5, 102,  27, 152, 173,  97, 178, 129,  73, 129, 231,
-		  5, 230,  27, 152, 175, 225,  52, 192, 127, 243, 170, 157, 149,  18, 121, 142, 115, 109, 227, 122,
-		 64,  87, 114, 111, 161,  49, 182,   6, 181, 158, 162, 226, 206, 167,  27, 215, 246,  48,  56,  99,
-		 67, 117,  16,  47,  13,  45,  35, 151,  98, 231,  75,   1, 173,  90,  61, 101, 146,  71, 136, 244,
-		170, 218, 145, 176, 123,  45, 173,  56, 113, 134, 191,  51, 219,  78, 235,  95,  28, 249, 253,   7,
-		159, 150, 133, 125,
-	}
-	OUTPUT_SIZE :: 432
-
-	buf: bytes.Buffer
-
-	// We can pass ", true" to inflate a raw DEFLATE stream instead of a ZLIB wrapped one.
-	err := inflate(input=ODIN_DEMO, buf=&buf, expected_output_size=OUTPUT_SIZE)
-	defer bytes.buffer_destroy(&buf)
-
-	if err != nil {
-		fmt.printf("\nError: %v\n", err)
-	}
-	s := bytes.buffer_to_string(&buf)
-	fmt.printf("Input: %v bytes, output (%v bytes):\n%v\n", len(ODIN_DEMO), len(s), s)
-	assert(len(s) == OUTPUT_SIZE)
-}
@@ -12,6 +12,7 @@ package compress_zlib

 import "core:compress"

+import "base:intrinsics"
 import "core:mem"
 import "core:io"
 import "core:hash"
@@ -123,13 +124,7 @@ Huffman_Table :: struct {
@(optimization_mode="favor_size")
 z_bit_reverse :: #force_inline proc(n: u16, bits: u8) -> (r: u16) {
 	assert(bits <= 16)
-	// NOTE: Can optimize with llvm.bitreverse.i64 or some bit twiddling
-	// by reversing all of the bits and masking out the unneeded ones.
-	r = n
-	r = ((r & 0xAAAA) >>  1) | ((r & 0x5555) << 1)
-	r = ((r & 0xCCCC) >>  2) | ((r & 0x3333) << 2)
-	r = ((r & 0xF0F0) >>  4) | ((r & 0x0F0F) << 4)
-	r = ((r & 0xFF00) >>  8) | ((r & 0x00FF) << 8)
+	r = intrinsics.reverse_bits(n)

 	r >>= (16 - bits)
 	return
@@ -1,5 +1,6 @@
 package container_dynamic_bit_array

+import "base:builtin"
 import "base:intrinsics"
 import "core:mem"

@@ -18,7 +19,7 @@ NUM_BITS :: 64
 Bit_Array :: struct {
 	bits:         [dynamic]u64,
 	bias:         int,
-	max_index:    int,
+	length:       int,
 	free_pointer: bool,
 }

@@ -52,9 +53,9 @@ Returns:
 */
 iterate_by_all :: proc (it: ^Bit_Array_Iterator) -> (set: bool, index: int, ok: bool) {
 	index = it.word_idx * NUM_BITS + int(it.bit_idx) + it.array.bias
-	if index > it.array.max_index { return false, 0, false }
+	if index >= it.array.length + it.array.bias { return false, 0, false }

-	word := it.array.bits[it.word_idx] if len(it.array.bits) > it.word_idx else 0
+	word := it.array.bits[it.word_idx] if builtin.len(it.array.bits) > it.word_idx else 0
 	set = (word >> it.bit_idx & 1) == 1

 	it.bit_idx += 1
@@ -106,22 +107,22 @@ Returns:
 */
@(private="file")
 iterate_internal_ :: proc (it: ^Bit_Array_Iterator, $ITERATE_SET_BITS: bool) -> (index: int, ok: bool) {
-	word := it.array.bits[it.word_idx] if len(it.array.bits) > it.word_idx else 0
+	word := it.array.bits[it.word_idx] if builtin.len(it.array.bits) > it.word_idx else 0
 	when ! ITERATE_SET_BITS { word = ~word }

 	// If the word is empty or we have already gone over all the bits in it,
 	// b.bit_idx is greater than the index of any set bit in the word,
 	// meaning that word >> b.bit_idx == 0.
-	for it.word_idx < len(it.array.bits) && word >> it.bit_idx == 0 {
+	for it.word_idx < builtin.len(it.array.bits) && word >> it.bit_idx == 0 {
 		it.word_idx += 1
 		it.bit_idx = 0
-		word = it.array.bits[it.word_idx] if len(it.array.bits) > it.word_idx else 0
+		word = it.array.bits[it.word_idx] if builtin.len(it.array.bits) > it.word_idx else 0
 		when ! ITERATE_SET_BITS { word = ~word }
 	}

 	// If we are iterating the set bits, reaching the end of the array means we have no more bits to check
 	when ITERATE_SET_BITS {
-		if it.word_idx >= len(it.array.bits) {
+		if it.word_idx >= builtin.len(it.array.bits) {
 			return 0, false
 		}
 	}
@@ -135,7 +136,7 @@ iterate_internal_ :: proc (it: ^Bit_Array_Iterator, $ITERATE_SET_BITS: bool) ->
 		it.bit_idx = 0
 		it.word_idx += 1
 	}
-	return index, index <= it.array.max_index
+	return index, index < it.array.length + it.array.bias
 }
 /*
 Gets the state of a bit in the bit-array
@@ -160,7 +161,7 @@ get :: proc(ba: ^Bit_Array, #any_int index: uint) -> (res: bool, ok: bool) #opti
 		If we `get` a bit that doesn't fit in the Bit Array, it's naturally `false`.
 		This early-out prevents unnecessary resizing.
 	*/
-	if leg_index + 1 > len(ba.bits) { return false, true }
+	if leg_index + 1 > builtin.len(ba.bits) { return false, true }

 	val := u64(1 << uint(bit_index))
 	res = ba.bits[leg_index] & val == val
@@ -208,7 +209,7 @@ set :: proc(ba: ^Bit_Array, #any_int index: uint, set_to: bool = true, allocator

 	resize_if_needed(ba, leg_index) or_return

-	ba.max_index = max(idx, ba.max_index)
+	ba.length = max(1 + idx, ba.length)

 	if set_to {
 		ba.bits[leg_index] |=  1 << uint(bit_index)
@@ -261,6 +262,9 @@ unsafe_unset :: proc(b: ^Bit_Array, bit: int) #no_bounds_check {
 /*
 A helper function to create a Bit Array with optional bias, in case your smallest index is non-zero (including negative).

+The range of bits created by this procedure is `min_index..<max_index`, and the
+array will be able to expand beyond `max_index` if needed.
+
 *Allocates (`new(Bit_Array) & make(ba.bits)`)*

 Inputs:
@@ -275,7 +279,7 @@ create :: proc(max_index: int, min_index: int = 0, allocator := context.allocato
 	context.allocator = allocator
 	size_in_bits := max_index - min_index

-	if size_in_bits < 1 { return {}, false }
+	if size_in_bits < 0 { return {}, false }

 	legs := size_in_bits >> INDEX_SHIFT
 	if size_in_bits & INDEX_MASK > 0 {legs+=1}
@@ -284,7 +288,7 @@ create :: proc(max_index: int, min_index: int = 0, allocator := context.allocato
 	res = new(Bit_Array)
 	res.bits         = bits
 	res.bias         = min_index
-	res.max_index    = max_index
+	res.length       = max_index - min_index
 	res.free_pointer = true
 	return
 }
@@ -299,6 +303,48 @@ clear :: proc(ba: ^Bit_Array) {
 	mem.zero_slice(ba.bits[:])
 }
 /*
+Gets the length of set and unset valid bits in the Bit_Array.
+
+Inputs:
+- ba: The target Bit_Array
+
+Returns:
+- length: The length of valid bits.
+*/
+len :: proc(ba: ^Bit_Array) -> (length: int) {
+	if ba == nil { return }
+	return ba.length
+}
+/*
+Shrinks the Bit_Array's backing storage to the smallest possible size.
+
+Inputs:
+- ba: The target Bit_Array
+*/
+shrink :: proc(ba: ^Bit_Array) #no_bounds_check {
+	if ba == nil { return } // Nothing to do for a nil Bit_Array.
+	legs_needed := builtin.len(ba.bits)
+	for i := legs_needed - 1; i >= 0; i -= 1 { // Walk backwards, dropping trailing all-zero legs.
+		if ba.bits[i] == 0 {
+			legs_needed -= 1
+		} else {
+			break
+		}
+	}
+	if legs_needed == builtin.len(ba.bits) { // No trailing zero legs were found: leave length and storage untouched.
+		return
+	}
+	ba.length = 0
+	if legs_needed > 0 {
+		if legs_needed > 1 {
+			ba.length = (legs_needed - 1) * NUM_BITS // Every leg below the last kept one counts in full.
+		}
+		ba.length += NUM_BITS - int(intrinsics.count_leading_zeros(ba.bits[legs_needed - 1])) // Plus one past the highest set bit of the last kept leg.
+	}
+	resize(&ba.bits, legs_needed)
+	builtin.shrink(&ba.bits)
+}
+/*
 Deallocates the Bit_Array and its backing storage

 Inputs:
@@ -321,8 +367,8 @@ resize_if_needed :: proc(ba: ^Bit_Array, legs: int, allocator := context.allocat

 	context.allocator = allocator

-	if legs + 1 > len(ba.bits) {
+	if legs + 1 > builtin.len(ba.bits) {
 		resize(&ba.bits, legs + 1)
 	}
-	return len(ba.bits) > legs
+	return builtin.len(ba.bits) > legs
 }
@@ -1,8 +1,8 @@
 /*
 The Bit Array can be used in several ways:

- By default you don't need to instantiate a Bit Array:
-
+By default you don't need to instantiate a Bit Array.
+Example:
 	package test

 	import "core:fmt"
@@ -22,8 +22,8 @@ The Bit Array can be used in several ways:
 		destroy(&bits)
 	}

- A Bit Array can optionally allow for negative indices, if the minimum value was given during creation:
-
+A Bit Array can optionally allow for negative indices, if the minimum value was given during creation.
+Example:
 	package test

 	import "core:fmt"
@@ -1,22 +1,22 @@
 /*
 Package list implements an intrusive doubly-linked list.

-An intrusive container requires a `Node` to be embedded in your own structure, like this:
-
+An intrusive container requires a `Node` to be embedded in your own structure, like this.
+Example:
 	My_String :: struct {
 		node:  list.Node,
 		value: string,
 	}

-Embedding the members of a `list.Node` in your structure with the `using` keyword is also allowed:
-
+Embedding the members of a `list.Node` in your structure with the `using` keyword is also allowed.
+Example:
 	My_String :: struct {
 		using node: list.Node,
 		value: string,
 	}

-Here is a full example:
-
+Here is a full example.
+Example:
 	package test
 	
 	import "core:fmt"
@@ -42,5 +42,8 @@ Here is a full example:
 	    value: string,
 	}

+Output:
+	Hello
+	World
 */
 package container_intrusive_list
@@ -139,9 +139,13 @@ clear :: proc "contextless" (a: ^$A/Small_Array($N, $T)) {
 	resize(a, 0)
 }

-push_back_elems :: proc "contextless" (a: ^$A/Small_Array($N, $T), items: ..T) {
-	n := copy(a.data[a.len:], items[:])
-	a.len += n
+push_back_elems :: proc "contextless" (a: ^$A/Small_Array($N, $T), items: ..T) -> bool {
+	if a.len + builtin.len(items) <= cap(a^) {
+		n := copy(a.data[a.len:], items[:])
+		a.len += n
+		return true
+	}
+	return false
 }

 inject_at :: proc "contextless" (a: ^$A/Small_Array($N, $T), item: T, index: int) -> bool #no_bounds_check {
@@ -80,8 +80,8 @@ ghash :: proc "contextless" (dst, key, data: []byte) {
 	h2 := h0 ~ h1
 	h2r := h0r ~ h1r

-	src: []byte
 	for l > 0 {
+		src: []byte = ---
 		if l >= _aes.GHASH_BLOCK_SIZE {
 			src = buf
 			buf = buf[_aes.GHASH_BLOCK_SIZE:]
@@ -3,7 +3,7 @@ package aes_hw_intel

 import "core:sys/info"

-// is_supporte returns true iff hardware accelerated AES
+// is_supported returns true iff hardware accelerated AES
 // is supported.
 is_supported :: proc "contextless" () -> bool {
 	features, ok := info.cpu_features.?
@@ -25,7 +25,6 @@ package aes_hw_intel

 import "base:intrinsics"
 import "core:crypto/_aes"
-import "core:simd"
 import "core:simd/x86"

@(private = "file")
@@ -58,14 +57,11 @@ GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
 // chunks. We number chunks from 0 to 3 in left to right order.

@(private = "file")
-byteswap_index := transmute(x86.__m128i)simd.i8x16{
-	// Note: simd.i8x16 is reverse order from x86._mm_set_epi8.
-	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-}
+_BYTESWAP_INDEX: x86.__m128i : { 0x08090a0b0c0d0e0f, 0x0001020304050607 }

@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
 byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
-	return x86._mm_shuffle_epi8(x, byteswap_index)
+	return x86._mm_shuffle_epi8(x, _BYTESWAP_INDEX)
 }

 // From a 128-bit value kw, compute kx as the XOR of the two 64-bit
@@ -244,8 +240,8 @@ ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
 	}

 	// Process 1 block at a time
-	src: []byte
 	for l > 0 {
+		src: []byte = ---
 		if l >= _aes.GHASH_BLOCK_SIZE {
 			src = buf
 			buf = buf[_aes.GHASH_BLOCK_SIZE:]
@@ -0,0 +1,123 @@
+package _chacha20
+
+import "base:intrinsics"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// KEY_SIZE is the (X)ChaCha20 key size in bytes.
+KEY_SIZE :: 32
+// IV_SIZE is the ChaCha20 IV size in bytes.
+IV_SIZE :: 12
+// XIV_SIZE is the XChaCha20 IV size in bytes.
+XIV_SIZE :: 24
+
+// MAX_CTR_IETF is the maximum counter value for the IETF flavor ChaCha20.
+MAX_CTR_IETF :: 0xffffffff
+// BLOCK_SIZE is the (X)ChaCha20 block size in bytes.
+BLOCK_SIZE :: 64
+// STATE_SIZE_U32 is the (X)ChaCha20 state size in u32s.
+STATE_SIZE_U32 :: 16
+// ROUNDS is the (X)ChaCha20 round count.
+ROUNDS :: 20
+
+// SIGMA_0 is sigma[0:4].
+SIGMA_0: u32 : 0x61707865
+// SIGMA_1 is sigma[4:8].
+SIGMA_1: u32 : 0x3320646e
+// SIGMA_2 is sigma[8:12].
+SIGMA_2: u32 : 0x79622d32
+// SIGMA_3 is sigma[12:16].
+SIGMA_3: u32 : 0x6b206574
+
+// Context is a ChaCha20 or XChaCha20 instance.
+Context :: struct {
+	_s:              [STATE_SIZE_U32]u32,
+	_buffer:         [BLOCK_SIZE]byte,
+	_off:            int,
+	_is_ietf_flavor: bool,
+	_is_initialized: bool,
+}
+
+// init initializes a Context for ChaCha20 with the provided key and
+// iv.
+//
+// WARNING: This ONLY handles ChaCha20.  XChaCha20 sub-key and IV
+// derivation is expected to be handled by the caller, so that the
+// HChaCha call can be suitably accelerated.
+init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) {
+	if len(key) != KEY_SIZE || len(iv) != IV_SIZE {
+		intrinsics.trap()
+	}
+
+	k, n := key, iv
+
+	ctx._s[0] = SIGMA_0
+	ctx._s[1] = SIGMA_1
+	ctx._s[2] = SIGMA_2
+	ctx._s[3] = SIGMA_3
+	ctx._s[4] = endian.unchecked_get_u32le(k[0:4])
+	ctx._s[5] = endian.unchecked_get_u32le(k[4:8])
+	ctx._s[6] = endian.unchecked_get_u32le(k[8:12])
+	ctx._s[7] = endian.unchecked_get_u32le(k[12:16])
+	ctx._s[8] = endian.unchecked_get_u32le(k[16:20])
+	ctx._s[9] = endian.unchecked_get_u32le(k[20:24])
+	ctx._s[10] = endian.unchecked_get_u32le(k[24:28])
+	ctx._s[11] = endian.unchecked_get_u32le(k[28:32])
+	ctx._s[12] = 0
+	ctx._s[13] = endian.unchecked_get_u32le(n[0:4])
+	ctx._s[14] = endian.unchecked_get_u32le(n[4:8])
+	ctx._s[15] = endian.unchecked_get_u32le(n[8:12])
+
+	ctx._off = BLOCK_SIZE
+	ctx._is_ietf_flavor = !is_xchacha
+	ctx._is_initialized = true
+}
+
+// seek seeks the (X)ChaCha20 stream counter to the specified block.
+seek :: proc(ctx: ^Context, block_nr: u64) {
+	assert(ctx._is_initialized)
+
+	if ctx._is_ietf_flavor {
+		if block_nr > MAX_CTR_IETF { // IETF flavor: the block number must not exceed the 32-bit maximum.
+			panic("crypto/chacha20: attempted to seek past maximum counter")
+		}
+	} else {
+		ctx._s[13] = u32(block_nr >> 32) // Non-IETF flavor: state word 13 receives the upper 32 bits.
+	}
+	ctx._s[12] = u32(block_nr) // State word 12 receives the lower 32 bits in both flavors.
+	ctx._off = BLOCK_SIZE // Same value `init` assigns; presumably marks the keystream buffer as empty -- TODO confirm against the consumer.
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	mem.zero_explicit(&ctx._s, size_of(ctx._s))
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+
+	ctx._is_initialized = false
+}
+
+check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	//
+	// While all modern "standard" definitions of ChaCha20 use
+	// the IETF 32-bit counter, for XChaCha20 most common
+	// implementations allow for a 64-bit counter.
+	//
+	// Honestly, the answer here is "use a MRAE primitive", but
+	// go with "common" practice in the case of XChaCha20.
+
+	ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per IV reached"
+
+	if ctx._is_ietf_flavor {
+		if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF {
+			panic(ERR_CTR_EXHAUSTED)
+		}
+	} else {
+		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
+		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
+			panic(ERR_CTR_EXHAUSTED)
+		}
+	}
+}
@@ -0,0 +1,360 @@
+package chacha20_ref
+
+import "core:crypto/_chacha20"
+import "core:encoding/endian"
+import "core:math/bits"
+
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst, src := dst, src
+	x := &ctx._s
+	for n := 0; n < nr_blocks; n = n + 1 {
+		x0, x1, x2, x3 :=
+			_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
+			x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			// Even when forcing inlining, manually inlining all of
+			// these is decently faster.
+
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = bits.rotate_left32(x12, 16)
+			x8 += x12
+			x4 ~= x8
+			x4 = bits.rotate_left32(x4, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = bits.rotate_left32(x12, 8)
+			x8 += x12
+			x4 ~= x8
+			x4 = bits.rotate_left32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ~= x1
+			x13 = bits.rotate_left32(x13, 16)
+			x9 += x13
+			x5 ~= x9
+			x5 = bits.rotate_left32(x5, 12)
+			x1 += x5
+			x13 ~= x1
+			x13 = bits.rotate_left32(x13, 8)
+			x9 += x13
+			x5 ~= x9
+			x5 = bits.rotate_left32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ~= x2
+			x14 = bits.rotate_left32(x14, 16)
+			x10 += x14
+			x6 ~= x10
+			x6 = bits.rotate_left32(x6, 12)
+			x2 += x6
+			x14 ~= x2
+			x14 = bits.rotate_left32(x14, 8)
+			x10 += x14
+			x6 ~= x10
+			x6 = bits.rotate_left32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ~= x3
+			x15 = bits.rotate_left32(x15, 16)
+			x11 += x15
+			x7 ~= x11
+			x7 = bits.rotate_left32(x7, 12)
+			x3 += x7
+			x15 ~= x3
+			x15 = bits.rotate_left32(x15, 8)
+			x11 += x15
+			x7 ~= x11
+			x7 = bits.rotate_left32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ~= x0
+			x15 = bits.rotate_left32(x15, 16)
+			x10 += x15
+			x5 ~= x10
+			x5 = bits.rotate_left32(x5, 12)
+			x0 += x5
+			x15 ~= x0
+			x15 = bits.rotate_left32(x15, 8)
+			x10 += x15
+			x5 ~= x10
+			x5 = bits.rotate_left32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = bits.rotate_left32(x12, 16)
+			x11 += x12
+			x6 ~= x11
+			x6 = bits.rotate_left32(x6, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = bits.rotate_left32(x12, 8)
+			x11 += x12
+			x6 ~= x11
+			x6 = bits.rotate_left32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ~= x2
+			x13 = bits.rotate_left32(x13, 16)
+			x8 += x13
+			x7 ~= x8
+			x7 = bits.rotate_left32(x7, 12)
+			x2 += x7
+			x13 ~= x2
+			x13 = bits.rotate_left32(x13, 8)
+			x8 += x13
+			x7 ~= x8
+			x7 = bits.rotate_left32(x7, 7)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ~= x3
+			x14 = bits.rotate_left32(x14, 16)
+			x9 += x14
+			x4 ~= x9
+			x4 = bits.rotate_left32(x4, 12)
+			x3 += x4
+			x14 ~= x3
+			x14 = bits.rotate_left32(x14, 8)
+			x9 += x14
+			x4 ~= x9
+			x4 = bits.rotate_left32(x4, 7)
+		}
+
+		x0 += _chacha20.SIGMA_0
+		x1 += _chacha20.SIGMA_1
+		x2 += _chacha20.SIGMA_2
+		x3 += _chacha20.SIGMA_3
+		x4 += x[4]
+		x5 += x[5]
+		x6 += x[6]
+		x7 += x[7]
+		x8 += x[8]
+		x9 += x[9]
+		x10 += x[10]
+		x11 += x[11]
+		x12 += x[12]
+		x13 += x[13]
+		x14 += x[14]
+		x15 += x[15]
+
+		// - The caller(s) ensure that src/dst are valid.
+		// - The compiler knows if the target is picky about alignment.
+
+		#no_bounds_check {
+			if src != nil {
+				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
+				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
+				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
+				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
+				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
+				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
+				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
+				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
+				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
+				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
+				endian.unchecked_put_u32le(
+					dst[40:44],
+					endian.unchecked_get_u32le(src[40:44]) ~ x10,
+				)
+				endian.unchecked_put_u32le(
+					dst[44:48],
+					endian.unchecked_get_u32le(src[44:48]) ~ x11,
+				)
+				endian.unchecked_put_u32le(
+					dst[48:52],
+					endian.unchecked_get_u32le(src[48:52]) ~ x12,
+				)
+				endian.unchecked_put_u32le(
+					dst[52:56],
+					endian.unchecked_get_u32le(src[52:56]) ~ x13,
+				)
+				endian.unchecked_put_u32le(
+					dst[56:60],
+					endian.unchecked_get_u32le(src[56:60]) ~ x14,
+				)
+				endian.unchecked_put_u32le(
+					dst[60:64],
+					endian.unchecked_get_u32le(src[60:64]) ~ x15,
+				)
+				src = src[_chacha20.BLOCK_SIZE:]
+			} else {
+				endian.unchecked_put_u32le(dst[0:4], x0)
+				endian.unchecked_put_u32le(dst[4:8], x1)
+				endian.unchecked_put_u32le(dst[8:12], x2)
+				endian.unchecked_put_u32le(dst[12:16], x3)
+				endian.unchecked_put_u32le(dst[16:20], x4)
+				endian.unchecked_put_u32le(dst[20:24], x5)
+				endian.unchecked_put_u32le(dst[24:28], x6)
+				endian.unchecked_put_u32le(dst[28:32], x7)
+				endian.unchecked_put_u32le(dst[32:36], x8)
+				endian.unchecked_put_u32le(dst[36:40], x9)
+				endian.unchecked_put_u32le(dst[40:44], x10)
+				endian.unchecked_put_u32le(dst[44:48], x11)
+				endian.unchecked_put_u32le(dst[48:52], x12)
+				endian.unchecked_put_u32le(dst[52:56], x13)
+				endian.unchecked_put_u32le(dst[56:60], x14)
+				endian.unchecked_put_u32le(dst[60:64], x15)
+			}
+			dst = dst[_chacha20.BLOCK_SIZE:]
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+	}
+}
+
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+	x4 := endian.unchecked_get_u32le(key[0:4])
+	x5 := endian.unchecked_get_u32le(key[4:8])
+	x6 := endian.unchecked_get_u32le(key[8:12])
+	x7 := endian.unchecked_get_u32le(key[12:16])
+	x8 := endian.unchecked_get_u32le(key[16:20])
+	x9 := endian.unchecked_get_u32le(key[20:24])
+	x10 := endian.unchecked_get_u32le(key[24:28])
+	x11 := endian.unchecked_get_u32le(key[28:32])
+	x12 := endian.unchecked_get_u32le(iv[0:4])
+	x13 := endian.unchecked_get_u32le(iv[4:8])
+	x14 := endian.unchecked_get_u32le(iv[8:12])
+	x15 := endian.unchecked_get_u32le(iv[12:16])
+
+	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = bits.rotate_left32(x12, 16)
+		x8 += x12
+		x4 ~= x8
+		x4 = bits.rotate_left32(x4, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = bits.rotate_left32(x12, 8)
+		x8 += x12
+		x4 ~= x8
+		x4 = bits.rotate_left32(x4, 7)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ~= x1
+		x13 = bits.rotate_left32(x13, 16)
+		x9 += x13
+		x5 ~= x9
+		x5 = bits.rotate_left32(x5, 12)
+		x1 += x5
+		x13 ~= x1
+		x13 = bits.rotate_left32(x13, 8)
+		x9 += x13
+		x5 ~= x9
+		x5 = bits.rotate_left32(x5, 7)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ~= x2
+		x14 = bits.rotate_left32(x14, 16)
+		x10 += x14
+		x6 ~= x10
+		x6 = bits.rotate_left32(x6, 12)
+		x2 += x6
+		x14 ~= x2
+		x14 = bits.rotate_left32(x14, 8)
+		x10 += x14
+		x6 ~= x10
+		x6 = bits.rotate_left32(x6, 7)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ~= x3
+		x15 = bits.rotate_left32(x15, 16)
+		x11 += x15
+		x7 ~= x11
+		x7 = bits.rotate_left32(x7, 12)
+		x3 += x7
+		x15 ~= x3
+		x15 = bits.rotate_left32(x15, 8)
+		x11 += x15
+		x7 ~= x11
+		x7 = bits.rotate_left32(x7, 7)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ~= x0
+		x15 = bits.rotate_left32(x15, 16)
+		x10 += x15
+		x5 ~= x10
+		x5 = bits.rotate_left32(x5, 12)
+		x0 += x5
+		x15 ~= x0
+		x15 = bits.rotate_left32(x15, 8)
+		x10 += x15
+		x5 ~= x10
+		x5 = bits.rotate_left32(x5, 7)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = bits.rotate_left32(x12, 16)
+		x11 += x12
+		x6 ~= x11
+		x6 = bits.rotate_left32(x6, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = bits.rotate_left32(x12, 8)
+		x11 += x12
+		x6 ~= x11
+		x6 = bits.rotate_left32(x6, 7)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ~= x2
+		x13 = bits.rotate_left32(x13, 16)
+		x8 += x13
+		x7 ~= x8
+		x7 = bits.rotate_left32(x7, 12)
+		x2 += x7
+		x13 ~= x2
+		x13 = bits.rotate_left32(x13, 8)
+		x8 += x13
+		x7 ~= x8
+		x7 = bits.rotate_left32(x7, 7)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ~= x3
+		x14 = bits.rotate_left32(x14, 16)
+		x9 += x14
+		x4 ~= x9
+		x4 = bits.rotate_left32(x4, 12)
+		x3 += x4
+		x14 ~= x3
+		x14 = bits.rotate_left32(x14, 8)
+		x9 += x14
+		x4 ~= x9
+		x4 = bits.rotate_left32(x4, 7)
+	}
+
+	endian.unchecked_put_u32le(dst[0:4], x0)
+	endian.unchecked_put_u32le(dst[4:8], x1)
+	endian.unchecked_put_u32le(dst[8:12], x2)
+	endian.unchecked_put_u32le(dst[12:16], x3)
+	endian.unchecked_put_u32le(dst[16:20], x12)
+	endian.unchecked_put_u32le(dst[20:24], x13)
+	endian.unchecked_put_u32le(dst[24:28], x14)
+	endian.unchecked_put_u32le(dst[28:32], x15)
+}
@@ -0,0 +1,481 @@
+package chacha20_simd128
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import "core:simd"
+@(require) import "core:sys/info"
+
+// Portable 128-bit `core:simd` implementation.
+//
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementation.
+//
+// This is written to perform adequately on any target that has "enough"
+// 128-bit vector registers, the current thought is that 4 blocks at a
+// time is reasonable for amd64, though Ted's code is more conservative.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+
+// Ensure the compiler emits SIMD instructions.  This is a minimum, and
+// setting the microarchitecture at compile time will allow for better
+// code gen when applicable (eg: AVX).  This is somewhat redundant with
+// the default microarchitecture configurations.
+// TARGET_SIMD_FEATURES is the feature string handed to
+// `enable_target_feature` on the public procedures in this file.
+when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+	// 32/64-bit ARM: request NEON.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "neon"
+} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	// Note: LLVM appears to be smart enough to use PSHUFB despite not
+	// explicitly using simd.u8x16 shuffles.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "sse2,ssse3"
+} else {
+	// Any other architecture: no additional target features are requested.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: ""
+}
+
+// Per-lane shift counts used to build 32-bit rotates out of a shift-left /
+// shift-right pair: a left rotate by N is `(x << N) ^ (x >> (32 - N))`,
+// hence the L/R pairs 7/25, 12/20 and 8/24.  A rotate by 16 uses the same
+// count in both directions, so it only needs one constant.
+@(private = "file")
+_ROT_7L: simd.u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x4 : {16, 16, 16, 16}
+
+when ODIN_ENDIAN == .Big {
+	// _increment_counter advances the 64-bit block counter held in
+	// ctx._s[12] (low word) and ctx._s[13] (high word) by one, stores it
+	// back into the context, and returns the updated final state row
+	// (ctx._s[12:16]) as a vector.
+	//
+	// Note: The parameter must be spelled `_chacha20.Context`; this
+	// package does not declare a `Context` of its own, and the callers
+	// pass a `^_chacha20.Context`.
+	@(private = "file")
+	_increment_counter :: #force_inline proc "contextless" (ctx: ^_chacha20.Context) -> simd.u32x4 {
+		// In the Big Endian case, the low and high portions in the vector
+		// are flipped, so the 64-bit addition can't be done with a simple
+		// vector add.
+		x := &ctx._s
+
+		new_ctr := ((u64(x[13]) << 32) | u64(x[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+
+		return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12])
+	}
+
+	// Convert the endian-ness of the components of a u32x4 vector, for
+	// the purposes of output.  The shuffle reverses the 4 bytes within
+	// each 32-bit lane, leaving the lane order untouched.
+	@(private = "file")
+	_byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 {
+		return(
+			transmute(simd.u32x4)simd.shuffle(
+				transmute(simd.u8x16)v,
+				transmute(simd.u8x16)v,
+				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+			)
+		)
+	}
+} else {
+	// Added to the final state row viewed as 2 x u64: bumps the 64-bit
+	// block counter in the low lane by one and leaves the high lane alone.
+	@(private = "file")
+	_VEC_ONE: simd.u64x2 : {1, 0}
+}
+
+// _dq_round_simd128 applies one double round to a single block whose
+// state is held as four row vectors (v0 .. v3), and returns the updated
+// rows.  The first half operates on the rows as-is; the rows are then
+// rotated lane-wise so that the second half operates on the diagonals,
+// and finally rotated back into their original lane order.
+//
+// Each 32-bit rotate is expressed as a shift-left/shift-right pair.
+@(private = "file")
+_dq_round_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+	v1 = simd.shuffle(v1, v1, 1, 2, 3, 0)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 3, 0, 1, 2)
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+	v1 = simd.shuffle(v1, v1, 3, 0, 1, 2)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 1, 2, 3, 0)
+
+	return v0, v1, v2, v3
+}
+
+// _add_state_simd128 adds the input state rows (s0 .. s3) lane-wise to
+// the permuted rows (v0 .. v3) and returns the sums.  On big-endian
+// targets the bytes of every 32-bit word are additionally swapped so the
+// result is ready for output.
+@(private = "file")
+_add_state_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	r0 := simd.add(v0, s0)
+	r1 := simd.add(v1, s1)
+	r2 := simd.add(v2, s2)
+	r3 := simd.add(v3, s3)
+
+	when ODIN_ENDIAN == .Big {
+		r0, r1, r2, r3 =
+			_byteswap_u32x4(r0),
+			_byteswap_u32x4(r1),
+			_byteswap_u32x4(r2),
+			_byteswap_u32x4(r3)
+	}
+
+	return r0, r1, r2, r3
+}
+
+// _xor_simd128 XORs four consecutive (possibly unaligned) vectors read
+// from src into v0 .. v3 and returns the results.
+@(private = "file")
+_xor_simd128 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	// Fetch the input first, then mix it into the supplied vectors.
+	in0 := intrinsics.unaligned_load((^simd.u32x4)(src[0:]))
+	in1 := intrinsics.unaligned_load((^simd.u32x4)(src[1:]))
+	in2 := intrinsics.unaligned_load((^simd.u32x4)(src[2:]))
+	in3 := intrinsics.unaligned_load((^simd.u32x4)(src[3:]))
+
+	return simd.bit_xor(v0, in0), simd.bit_xor(v1, in1), simd.bit_xor(v2, in2), simd.bit_xor(v3, in3)
+}
+
+// _store_simd128 writes v0 .. v3 to four consecutive vector-sized slots
+// starting at dst, using unaligned stores.
+@(private = "file")
+_store_simd128 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) {
+	slot0 := (^simd.u32x4)(dst[0:])
+	slot1 := (^simd.u32x4)(dst[1:])
+	slot2 := (^simd.u32x4)(dst[2:])
+	slot3 := (^simd.u32x4)(dst[3:])
+
+	intrinsics.unaligned_store(slot0, v0)
+	intrinsics.unaligned_store(slot1, v1)
+	intrinsics.unaligned_store(slot2, v2)
+	intrinsics.unaligned_store(slot3, v3)
+}
+
+// is_performant reports whether this implementation is expected to be
+// performant: on ARM and x86 targets the host CPU must be detected and
+// expose the required feature set, on WebAssembly targets the build must
+// have `simd128` enabled, and every other target reports false.
+is_performant :: proc "contextless" () -> bool {
+	when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 {
+		return intrinsics.has_target_feature("simd128")
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+		// Feature detection can fail, in which case assume the worst.
+		detected, detected_ok := info.cpu_features.?
+		return detected_ok && detected >= info.CPU_Features{.asimd}
+	} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+		// Feature detection can fail, in which case assume the worst.
+		detected, detected_ok := info.cpu_features.?
+		return detected_ok && detected >= info.CPU_Features{.sse2, .ssse3}
+	} else {
+		return false
+	}
+}
+
+// stream_blocks generates nr_blocks keystream blocks (4 x simd.u32x4,
+// i.e. 64 bytes each) from the state in ctx and writes them to dst.  If
+// src is non-nil, each block is XORed with the corresponding 64 bytes of
+// src before being stored.  The block counter in ctx is advanced by one
+// per block produced.
+//
+// Blocks are processed 8 at a time (arm64 only), then 4 at a time (all
+// targets except i386), then one at a time.
+//
+// Note: dst and src are accessed through raw multi-pointers with bounds
+// checking disabled, so the caller is presumably responsible for them
+// being at least `nr_blocks * 64` bytes long.
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst_v := ([^]simd.u32x4)(raw_data(dst))
+	src_v := ([^]simd.u32x4)(raw_data(src))
+
+	x := &ctx._s
+	n := nr_blocks
+
+	// The state vector is an array of uint32s in native byte-order.
+	x_v := ([^]simd.u32x4)(raw_data(x))
+	s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+	s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+	s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+	s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+
+	// 8 blocks at a time.
+	//
+	// Note: This is only worth it on Aarch64.
+	when ODIN_ARCH == .arm64 {
+		for ; n >= 8; n = n - 8 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			// Rows 0 .. 2 are shared by every block; only the final row
+			// (which holds the counter) differs, so derive s7, s11, ...
+			// by repeatedly incrementing the previous block's final row.
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			when ODIN_ENDIAN == .Little {
+				s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s19 := _increment_counter(ctx)
+			}
+
+			v16, v17, v18, v19 := s0, s1, s2, s19
+			when ODIN_ENDIAN == .Little {
+				s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE)
+			} else {
+				s23 := _increment_counter(ctx)
+			}
+
+			v20, v21, v22, v23 := s0, s1, s2, s23
+			when ODIN_ENDIAN == .Little {
+				s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE)
+			} else {
+				s27 := _increment_counter(ctx)
+			}
+
+			v24, v25, v26, v27 := s0, s1, s2, s27
+			when ODIN_ENDIAN == .Little {
+				s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE)
+			} else {
+				s31 := _increment_counter(ctx)
+			}
+			v28, v29, v30, v31 := s0, s1, s2, s31
+
+			// Each call performs a double round, hence the step of 2.
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+				v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19)
+				v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23)
+				v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27)
+				v28, v29, v30, v31 = _dq_round_simd128(v28, v29, v30, v31)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+			v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19)
+			v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23)
+			v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27)
+			v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31)
+
+			#no_bounds_check {
+				// A nil src means "emit raw keystream".
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19)
+					v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23)
+					v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27)
+					v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31)
+					src_v = src_v[32:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				_store_simd128(dst_v[16:], v16, v17, v18, v19)
+				_store_simd128(dst_v[20:], v20, v21, v22, v23)
+				_store_simd128(dst_v[24:], v24, v25, v26, v27)
+				_store_simd128(dst_v[28:], v28, v29, v30, v31)
+				dst_v = dst_v[32:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s31 holds the most current counter, so `s3 = s31 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 4 blocks at a time.
+	//
+	// Note: The i386 target lacks the required number of registers
+	// for this to be performant, so it is skipped.
+	when ODIN_ARCH != .i386 {
+		for ; n >= 4; n = n - 4 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+
+			#no_bounds_check {
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					src_v = src_v[16:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				dst_v = dst_v[16:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s15 holds the most current counter, so `s3 = s15 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 1 block at a time.
+	for ; n > 0; n = n - 1 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+		}
+		v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+				src_v = src_v[4:]
+			}
+
+			_store_simd128(dst_v, v0, v1, v2, v3)
+			dst_v = dst_v[4:]
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		when ODIN_ENDIAN == .Little {
+			s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+		} else {
+			s3 = _increment_counter(ctx)
+		}
+	}
+
+	// On big-endian targets _increment_counter has already updated the
+	// context in place, so only the little-endian path needs a write back.
+	when ODIN_ENDIAN == .Little {
+		// Write back the counter to the state.
+		intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3)
+	}
+}
+
+// hchacha20 runs the rounds over a state built from the SIGMA constants,
+// the 32 bytes at key[0:32] and the 16 bytes at iv[0:16], and writes the
+// first and last rows of the result (32 bytes total) to dst.  Unlike
+// stream_blocks, the initial state is not added back in.
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	row0 := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3}
+	row1 := intrinsics.unaligned_load((^simd.u32x4)(&key[0]))
+	row2 := intrinsics.unaligned_load((^simd.u32x4)(&key[16]))
+	row3 := intrinsics.unaligned_load((^simd.u32x4)(&iv[0]))
+
+	// The key and iv were loaded as raw bytes; fix up the word order on
+	// big-endian targets before computing on them.
+	when ODIN_ENDIAN == .Big {
+		row1, row2, row3 = _byteswap_u32x4(row1), _byteswap_u32x4(row2), _byteswap_u32x4(row3)
+	}
+
+	// Each call performs a double round, hence the step of 2.
+	for remaining := _chacha20.ROUNDS; remaining > 0; remaining -= 2 {
+		row0, row1, row2, row3 = _dq_round_simd128(row0, row1, row2, row3)
+	}
+
+	// Only the outer rows are output, so only those need converting back.
+	when ODIN_ENDIAN == .Big {
+		row0, row3 = _byteswap_u32x4(row0), _byteswap_u32x4(row3)
+	}
+
+	out := ([^]simd.u32x4)(raw_data(dst))
+	intrinsics.unaligned_store((^simd.u32x4)(out[0:]), row0)
+	intrinsics.unaligned_store((^simd.u32x4)(out[1:]), row3)
+}
@@ -0,0 +1,319 @@
+//+build amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import chacha_simd128 "core:crypto/_chacha20/simd128"
+import "core:simd"
+import "core:sys/info"
+
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementations.  While written using `core:simd`, this is currently
+// amd64 specific because we do not have a way to detect ARM SVE.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+// supercop-20230530/crypto_stream/chacha20/krovetz/avx2
+
+#assert(ODIN_ENDIAN == .Little)
+
+// Per-lane shift counts used to build 32-bit rotates out of a shift-left /
+// shift-right pair: a left rotate by N is `(x << N) ^ (x >> (32 - N))`,
+// hence the L/R pairs 7/25, 12/20 and 8/24, and the single constant for 16.
+@(private = "file")
+_ROT_7L: simd.u32x8 : {7, 7, 7, 7, 7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x8 : {25, 25, 25, 25, 25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x8 : {12, 12, 12, 12, 12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x8 : {20, 20, 20, 20, 20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x8 : {24, 24, 24, 24, 24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x8 : {16, 16, 16, 16, 16, 16, 16, 16}
+// Counter adjustments, applied to the final state row viewed as 4 x u64
+// (two copies of the row, one per 128-bit half).  _VEC_ZERO_ONE bumps the
+// 64-bit counter of the second copy only; _VEC_TWO bumps the counter of
+// both copies by two.
+@(private = "file")
+_VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
+@(private = "file")
+_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
+
+// is_performant reports whether the host CPU was detected and supports
+// both AVX and AVX2.  If CPU feature detection is unavailable, false is
+// returned.
+is_performant :: proc "contextless" () -> bool {
+	detected, detected_ok := info.cpu_features.?
+	return detected_ok && detected >= info.CPU_Features{.avx, .avx2}
+}
+
+// _dq_round_simd256 applies one double round to two blocks at once.  Each
+// of v0 .. v3 holds the same state row for both blocks, one per 128-bit
+// half.  The lane rotations between the two halves of the double round
+// are done independently within each 128-bit half (indices 0..3 and
+// 4..7), so the two blocks never mix.
+@(private = "file")
+_dq_round_simd256 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+	v1 = simd.shuffle(v1, v1, 1, 2, 3, 0, 5, 6, 7, 4)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+	v3 = simd.shuffle(v3, v3, 3, 0, 1, 2, 7, 4, 5, 6)
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+	v1 = simd.shuffle(v1, v1, 3, 0, 1, 2, 7, 4, 5, 6)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+	v3 = simd.shuffle(v3, v3, 1, 2, 3, 0, 5, 6, 7, 4)
+
+	return v0, v1, v2, v3
+}
+
+// _add_and_permute_state_simd256 adds the input state rows (s0 .. s3)
+// lane-wise to the permuted rows (v0 .. v3), then regroups the result so
+// that the first two returned vectors hold the first block (rows 0-1,
+// rows 2-3) and the last two hold the second block in the same layout.
+@(private = "file")
+_add_and_permute_state_simd256 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	sum0 := simd.add(v0, s0)
+	sum1 := simd.add(v1, s1)
+	sum2 := simd.add(v2, s2)
+	sum3 := simd.add(v3, s3)
+
+	// Big Endian would byteswap here.
+
+	// Lanes 0..3 of every sum belong to the first block and lanes 4..7 to
+	// the second.  Shuffle indices 8..15 select from the second operand.
+	blk0_rows01 := simd.shuffle(sum0, sum1, 0, 1, 2, 3, 8, 9, 10, 11)
+	blk0_rows23 := simd.shuffle(sum2, sum3, 0, 1, 2, 3, 8, 9, 10, 11)
+	blk1_rows01 := simd.shuffle(sum0, sum1, 4, 5, 6, 7, 12, 13, 14, 15)
+	blk1_rows23 := simd.shuffle(sum2, sum3, 4, 5, 6, 7, 12, 13, 14, 15)
+
+	return blk0_rows01, blk0_rows23, blk1_rows01, blk1_rows23
+}
+
+// _xor_simd256 XORs four consecutive (possibly unaligned) vectors read
+// from src into v0 .. v3 and returns the results.
+@(private = "file")
+_xor_simd256 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x8,
+	v0, v1, v2, v3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	// Fetch the input first, then mix it into the supplied vectors.
+	in0 := intrinsics.unaligned_load((^simd.u32x8)(src[0:]))
+	in1 := intrinsics.unaligned_load((^simd.u32x8)(src[1:]))
+	in2 := intrinsics.unaligned_load((^simd.u32x8)(src[2:]))
+	in3 := intrinsics.unaligned_load((^simd.u32x8)(src[3:]))
+
+	return simd.bit_xor(v0, in0), simd.bit_xor(v1, in1), simd.bit_xor(v2, in2), simd.bit_xor(v3, in3)
+}
+
+// _xor_simd256_x1 is the two-vector variant of the XOR helper: it XORs
+// two consecutive (possibly unaligned) vectors read from src into v0 and
+// v1 and returns the results.
+@(private = "file")
+_xor_simd256_x1 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x8,
+	v0, v1: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+) {
+	in0 := intrinsics.unaligned_load((^simd.u32x8)(src[0:]))
+	in1 := intrinsics.unaligned_load((^simd.u32x8)(src[1:]))
+
+	return simd.bit_xor(v0, in0), simd.bit_xor(v1, in1)
+}
+
+// _store_simd256 writes v0 .. v3 to four consecutive vector-sized slots
+// starting at dst, using unaligned stores.
+@(private = "file")
+_store_simd256 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x8,
+	v0, v1, v2, v3: simd.u32x8,
+) {
+	slot0 := (^simd.u32x8)(dst[0:])
+	slot1 := (^simd.u32x8)(dst[1:])
+	slot2 := (^simd.u32x8)(dst[2:])
+	slot3 := (^simd.u32x8)(dst[3:])
+
+	intrinsics.unaligned_store(slot0, v0)
+	intrinsics.unaligned_store(slot1, v1)
+	intrinsics.unaligned_store(slot2, v2)
+	intrinsics.unaligned_store(slot3, v3)
+}
+
+// _store_simd256_x1 writes v0 and v1 to two consecutive vector-sized
+// slots starting at dst, using unaligned stores.
+@(private = "file")
+_store_simd256_x1 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x8,
+	v0, v1: simd.u32x8,
+) {
+	slot0 := (^simd.u32x8)(dst[0:])
+	slot1 := (^simd.u32x8)(dst[1:])
+
+	intrinsics.unaligned_store(slot0, v0)
+	intrinsics.unaligned_store(slot1, v1)
+}
+
+// stream_blocks generates nr_blocks 64-byte keystream blocks from the
+// state in ctx and writes them to dst.  If src is non-nil, each block is
+// XORed with the corresponding 64 bytes of src before being stored.  The
+// block counter in ctx is advanced by nr_blocks.
+//
+// Every 256-bit vector carries the same state row for two consecutive
+// blocks, so blocks are produced 8 at a time, then 2 at a time, with a
+// trailing odd block handled by discarding the second half of a pair.
+//
+// Note: dst and src are accessed through raw multi-pointers with bounds
+// checking disabled, so the caller is presumably responsible for them
+// being at least `nr_blocks * 64` bytes long.
+@(enable_target_feature = "sse2,ssse3,avx,avx2")
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst_v := ([^]simd.u32x8)(raw_data(dst))
+	src_v := ([^]simd.u32x8)(raw_data(src))
+
+	x := &ctx._s
+	n := nr_blocks
+
+	// The state vector is an array of uint32s in native byte-order.
+	// Setup s0 .. s3 such that each register stores 2 copies of the
+	// state.
+	x_v := ([^]simd.u32x4)(raw_data(x))
+	t0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+	t1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+	t2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+	t3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+	s0 := simd.swizzle(t0, 0, 1, 2, 3, 0, 1, 2, 3)
+	s1 := simd.swizzle(t1, 0, 1, 2, 3, 0, 1, 2, 3)
+	s2 := simd.swizzle(t2, 0, 1, 2, 3, 0, 1, 2, 3)
+	s3 := simd.swizzle(t3, 0, 1, 2, 3, 0, 1, 2, 3)
+
+	// Advance the counter in the 2nd copy of the state by one.
+	s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_ZERO_ONE)
+
+	// 8 blocks at a time.
+	for ; n >= 8; n = n - 8 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		// Each vector covers two blocks, so successive final rows are
+		// two counter values apart.
+		s7 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+		v4, v5, v6, v7 := s0, s1, s2, s7
+
+		s11 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s7, _VEC_TWO)
+		v8, v9, v10, v11 := s0, s1, s2, s11
+
+		s15 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s11, _VEC_TWO)
+		v12, v13, v14, v15 := s0, s1, s2, s15
+
+		// Each call performs a double round, hence the step of 2.
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+			v4, v5, v6, v7 = _dq_round_simd256(v4, v5, v6, v7)
+			v8, v9, v10, v11 = _dq_round_simd256(v8, v9, v10, v11)
+			v12, v13, v14, v15 = _dq_round_simd256(v12, v13, v14, v15)
+		}
+
+		v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+		v4, v5, v6, v7 = _add_and_permute_state_simd256(v4, v5, v6, v7, s0, s1, s2, s7)
+		v8, v9, v10, v11 = _add_and_permute_state_simd256(v8, v9, v10, v11, s0, s1, s2, s11)
+		v12, v13, v14, v15 = _add_and_permute_state_simd256(v12, v13, v14, v15, s0, s1, s2, s15)
+
+		#no_bounds_check {
+			// A nil src means "emit raw keystream".
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+				v4, v5, v6, v7 = _xor_simd256(src_v[4:], v4, v5, v6, v7)
+				v8, v9, v10, v11 = _xor_simd256(src_v[8:], v8, v9, v10, v11)
+				v12, v13, v14, v15 = _xor_simd256(src_v[12:], v12, v13, v14, v15)
+				src_v = src_v[16:]
+			}
+
+			_store_simd256(dst_v, v0, v1, v2, v3)
+			_store_simd256(dst_v[4:], v4, v5, v6, v7)
+			_store_simd256(dst_v[8:], v8, v9, v10, v11)
+			_store_simd256(dst_v[12:], v12, v13, v14, v15)
+			dst_v = dst_v[16:]
+		}
+
+		// s15 holds the most current counters, so `s3 = s15 + 2`.
+		s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s15, _VEC_TWO)
+	}
+
+
+	// 2 (or 1) block at a time.
+	for ; n > 0; n = n - 2 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+		}
+		v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+
+		// After the permute, (v0, v1) is the first block of the pair and
+		// (v2, v3) the second; with one block left only the first is used.
+		if n == 1 {
+			// Note: No need to advance src_v, dst_v, or increment the counter
+			// since this is guaranteed to be the final block.
+			#no_bounds_check {
+				if src != nil {
+					v0, v1 = _xor_simd256_x1(src_v, v0, v1)
+				}
+
+				_store_simd256_x1(dst_v, v0, v1)
+			}
+			break
+		}
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+				src_v = src_v[4:]
+			}
+
+			_store_simd256(dst_v, v0, v1, v2, v3)
+			dst_v = dst_v[4:]
+		}
+
+		s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+	}
+
+	// Write back the counter.  Doing it this way, saves having to
+	// pull out the correct counter value from s3.
+	new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + u64(nr_blocks)
+	ctx._s[12] = u32(new_ctr)
+	ctx._s[13] = u32(new_ctr >> 32)
+}
+
+// hchacha20 forwards to the 128-bit SIMD implementation, force-inlined
+// into this procedure so that it is compiled with AVX enabled.
+@(enable_target_feature = "sse2,ssse3,avx")
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	// We can just enable AVX and call the simd128 code as going
+	// wider has 0 performance benefit, but VEX encoded instructions
+	// is nice.
+	#force_inline chacha_simd128.hchacha20(dst, key, iv)
+}
@@ -0,0 +1,17 @@
+//+build !amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+
+// is_performant always returns false, as this file is only built for
+// non-amd64 targets, where the 256-bit implementation is unsupported.
+is_performant :: proc "contextless" () -> bool {
+	return false
+}
+
+// stream_blocks is a stub that unconditionally panics; it exists so that
+// the package exposes the same procedures on every target.
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	panic("crypto/chacha20: simd256 implementation unsupported")
+}
+
+// hchacha20 is a stub that unconditionally traps.  It is a "contextless"
+// procedure, so it uses `intrinsics.trap` rather than `panic`.
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	intrinsics.trap()
+}
@@ -0,0 +1,36 @@
+package aead
+
+// seal_oneshot encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided algorithm, key, and iv, stores the output in dst and tag.
+//
+// A temporary Context is initialized for the call and sanitized before
+// returning.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, impl: Implementation = nil) {
+	ctx: Context
+	init(&ctx, algo, key, impl)
+	// Sanitize the temporary Context on the way out.
+	defer reset(&ctx)
+	seal_ctx(&ctx, dst, tag, iv, aad, plaintext)
+}
+
+// open_oneshot authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided algorithm, key, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// A temporary Context is initialized for the call and sanitized before
+// returning.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open_oneshot :: proc(algo: Algorithm, dst, key, iv, aad, ciphertext, tag: []byte, impl: Implementation = nil) -> bool {
+	ctx: Context
+	init(&ctx, algo, key, impl)
+	// Sanitize the temporary Context on the way out.
+	defer reset(&ctx)
+	return open_ctx(&ctx, dst, iv, aad, ciphertext, tag)
+}
+
+// seal is the overloaded entry point for sealing: it resolves to seal_ctx
+// when given a Context, and to seal_oneshot when given an Algorithm.
+seal :: proc {
+	seal_ctx,
+	seal_oneshot,
+}
+
+// open is the overloaded entry point for opening: it resolves to open_ctx
+// when given a Context, and to open_oneshot when given an Algorithm.
+open :: proc {
+	open_ctx,
+	open_oneshot,
+}
@@ -0,0 +1,57 @@
+/*
+package aead provides a generic interface to the supported Authenticated
+Encryption with Associated Data algorithms.
+
+Both a one-shot and context based interface are provided, with similar
+usage.  If multiple messages are to be sealed/opened via the same key,
+the context based interface may be more efficient, depending on the
+algorithm.
+
+WARNING: Reusing the same key + iv to seal (encrypt) multiple messages
+results in catastrophic loss of security for most algorithms.
+
+Example:
+	package aead_example
+
+	import "core:bytes"
+	import "core:crypto"
+	import "core:crypto/aead"
+
+	main :: proc() {
+		algo := aead.Algorithm.XCHACHA20POLY1305
+
+		// Example additional associated data (AAD) and plaintext.
+		aad_str := "Get your ass in gear boys."
+		pt_str := "They're immanentizing the Eschaton."
+
+		aad := transmute([]byte)aad_str
+		plaintext := transmute([]byte)pt_str
+		pt_len := len(plaintext)
+
+		// Generate a random key for the purposes of illustration.
+		key := make([]byte, aead.KEY_SIZES[algo])
+		defer delete(key)
+		crypto.rand_bytes(key)
+
+		// `ciphertext || tag`, is a common way data is transmitted, so
+		// demonstrate that.
+		buf := make([]byte, pt_len + aead.TAG_SIZES[algo])
+		defer delete(buf)
+		ciphertext, tag := buf[:pt_len], buf[pt_len:]
+
+		// Seal the AAD + Plaintext.
+		iv := make([]byte, aead.IV_SIZES[algo])
+		defer delete(iv)
+		crypto.rand_bytes(iv) // Random IVs are safe with XChaCha20-Poly1305.
+		aead.seal(algo, ciphertext, tag, key, iv, aad, plaintext)
+
+		// Open the AAD + Ciphertext.
+		opened_pt := buf[:pt_len]
+		if ok := aead.open(algo, opened_pt, key, iv, aad, ciphertext, tag); !ok {
+			panic("aead example: failed to open")
+		}
+
+		assert(bytes.equal(opened_pt, plaintext))
+	}
+*/
+package aead
@@ -0,0 +1,187 @@
+package aead
+
+import "core:crypto/aes"
+import "core:crypto/chacha20"
+import "core:crypto/chacha20poly1305"
+import "core:reflect"
+
+// Implementation is an AEAD implementation.  Most callers will not need
+// to use this as the package will automatically select the most performant
+// implementation available.
+//
+// The variant must match the family of the Algorithm it is used with:
+// aes.Implementation for the AES-GCM algorithms, chacha20.Implementation
+// for the (X)ChaCha20-Poly1305 algorithms.
+Implementation :: union {
+	aes.Implementation,
+	chacha20.Implementation,
+}
+
+// MAX_TAG_SIZE is the maximum size tag that can be returned by any of the
+// Algorithms supported via this package.
+MAX_TAG_SIZE :: 16
+
+// Algorithm is the algorithm identifier associated with a given Context.
+//
+// Invalid is the first (zero) value, so a zero-initialized or reset
+// Context reports Invalid.
+Algorithm :: enum {
+	Invalid,
+	AES_GCM_128,
+	AES_GCM_192,
+	AES_GCM_256,
+	CHACHA20POLY1305,
+	XCHACHA20POLY1305,
+}
+
+// ALGORITHM_NAMES is the Algorithm to algorithm name string.
+ALGORITHM_NAMES := [Algorithm]string {
+	.Invalid           = "Invalid",
+	.AES_GCM_128       = "AES-GCM-128",
+	.AES_GCM_192       = "AES-GCM-192",
+	.AES_GCM_256       = "AES-GCM-256",
+	.CHACHA20POLY1305  = "chacha20poly1305",
+	.XCHACHA20POLY1305 = "xchacha20poly1305",
+}
+
+// TAG_SIZES is the Algorithm to tag size in bytes.
+TAG_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.GCM_TAG_SIZE,
+	.AES_GCM_192       = aes.GCM_TAG_SIZE,
+	.AES_GCM_256       = aes.GCM_TAG_SIZE,
+	.CHACHA20POLY1305  = chacha20poly1305.TAG_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.TAG_SIZE,
+}
+
+// KEY_SIZES is the Algorithm to key size in bytes.
+KEY_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.KEY_SIZE_128,
+	.AES_GCM_192       = aes.KEY_SIZE_192,
+	.AES_GCM_256       = aes.KEY_SIZE_256,
+	.CHACHA20POLY1305  = chacha20poly1305.KEY_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.KEY_SIZE,
+}
+
+// IV_SIZES is the Algorithm to initialization vector size in bytes.
+//
+// Note: Some algorithms (such as AES-GCM) support variable IV sizes.
+IV_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.GCM_IV_SIZE,
+	.AES_GCM_192       = aes.GCM_IV_SIZE,
+	.AES_GCM_256       = aes.GCM_IV_SIZE,
+	.CHACHA20POLY1305  = chacha20poly1305.IV_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.XIV_SIZE,
+}
+
+// Context is a concrete instantiation of a specific AEAD algorithm.
+//
+// The zero value is uninitialized: `_algo` is Invalid and `_impl` is nil.
+Context :: struct {
+	// The algorithm the Context was initialized with.
+	_algo: Algorithm,
+	// The underlying algorithm-specific context.
+	_impl: union {
+		aes.Context_GCM,
+		chacha20poly1305.Context,
+	},
+}
+
+// _IMPL_IDS maps each Algorithm to the typeid of the `Context._impl`
+// union variant that backs it.
+@(private)
+_IMPL_IDS := [Algorithm]typeid {
+	.Invalid           = nil,
+	.AES_GCM_128       = typeid_of(aes.Context_GCM),
+	.AES_GCM_192       = typeid_of(aes.Context_GCM),
+	.AES_GCM_256       = typeid_of(aes.Context_GCM),
+	.CHACHA20POLY1305  = typeid_of(chacha20poly1305.Context),
+	.XCHACHA20POLY1305 = typeid_of(chacha20poly1305.Context),
+}
+
+// init initializes a Context with a specific AEAD Algorithm and key.
+//
+// A Context that is already initialized is reset first.  If impl is nil
+// the default implementation for the algorithm family is used, otherwise
+// impl must hold the variant matching the algorithm family.
+//
+// Panics if the key size does not match the algorithm, or if the
+// algorithm is Invalid or unknown.
+init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementation = nil) {
+	if ctx._impl != nil {
+		reset(ctx)
+	}
+
+	if len(key) != KEY_SIZES[algorithm] {
+		panic("crypto/aead: invalid key size")
+	}
+
+	// Select the union variant in place via its type ID, rather than
+	// building the inner context separately and copying it in.
+	reflect.set_union_variant_typeid(ctx._impl, _IMPL_IDS[algorithm])
+
+	switch algorithm {
+	case .AES_GCM_128, .AES_GCM_192, .AES_GCM_256:
+		aes_impl := aes.DEFAULT_IMPLEMENTATION
+		if impl != nil {
+			aes_impl = impl.(aes.Implementation)
+		}
+		aes.init_gcm(&ctx._impl.(aes.Context_GCM), key, aes_impl)
+	case .CHACHA20POLY1305:
+		chacha_impl := chacha20.DEFAULT_IMPLEMENTATION
+		if impl != nil {
+			chacha_impl = impl.(chacha20.Implementation)
+		}
+		chacha20poly1305.init(&ctx._impl.(chacha20poly1305.Context), key, chacha_impl)
+	case .XCHACHA20POLY1305:
+		chacha_impl := chacha20.DEFAULT_IMPLEMENTATION
+		if impl != nil {
+			chacha_impl = impl.(chacha20.Implementation)
+		}
+		chacha20poly1305.init_xchacha(&ctx._impl.(chacha20poly1305.Context), key, chacha_impl)
+	case .Invalid:
+		panic("crypto/aead: uninitialized algorithm")
+	case:
+		panic("crypto/aead: invalid algorithm")
+	}
+
+	ctx._algo = algorithm
+}
+
+// seal_ctx encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided Context and iv, stores the output in dst and tag.
+//
+// Panics if the Context has not been initialized.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	// Dispatch on whichever algorithm-specific context is active.
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		aes.seal_gcm(&impl, dst, tag, iv, aad, plaintext)
+	case chacha20poly1305.Context:
+		chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext)
+	case:
+		panic("crypto/aead: uninitialized algorithm")
+	}
+}
+
+// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// Panics if the Context has not been initialized.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	// Dispatch on whichever algorithm-specific context is active.
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		return aes.open_gcm(&impl, dst, iv, aad, ciphertext, tag)
+	case chacha20poly1305.Context:
+		return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag)
+	case:
+		panic("crypto/aead: uninitialized algorithm")
+	}
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+reset :: proc(ctx: ^Context) {
+	// Let the active algorithm-specific context sanitize itself first.
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		aes.reset_gcm(&impl)
+	case chacha20poly1305.Context:
+		chacha20poly1305.reset(&impl)
+	case:
+		// Calling reset repeatedly is fine.
+	}
+
+	// Return the Context to its uninitialized state.
+	ctx._algo = .Invalid
+	ctx._impl = nil
+}
+
+// algorithm returns the Algorithm used by a Context instance.  An
+// uninitialized or reset Context reports `.Invalid`.
+algorithm :: proc(ctx: ^Context) -> Algorithm {
+	return ctx._algo
+}
+
+// iv_size returns the IV size of a Context instance in bytes, as listed
+// in IV_SIZES (0 for an uninitialized or reset Context).
+iv_size :: proc(ctx: ^Context) -> int {
+	return IV_SIZES[ctx._algo]
+}
+
+// tag_size returns the tag size of a Context instance in bytes, as listed
+// in TAG_SIZES (0 for an uninitialized or reset Context).
+tag_size :: proc(ctx: ^Context) -> int {
+	return TAG_SIZES[ctx._algo]
+}
@@ -2,9 +2,9 @@
 package aes implements the AES block cipher and some common modes.

 See:
- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf
- https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf
- https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197-upd1.pdf ]]
+- [[ https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf ]]
+- [[ https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf ]]
 */
 package aes

@@ -20,7 +20,7 @@ Context_CTR :: struct {
 }

 // init_ctr initializes a Context_CTR with the provided key and IV.
-init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hardware) {
+init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	if len(iv) != CTR_IV_SIZE {
 		panic("crypto/aes: invalid CTR IV size")
 	}
@@ -47,7 +47,7 @@ xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {
 		panic("crypto/aes: dst and src alias inexactly")
 	}

-	for remaining := len(src); remaining > 0; {
+	#no_bounds_check for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == BLOCK_SIZE {
 			if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 {
@@ -85,7 +85,7 @@ keystream_bytes_ctr :: proc(ctx: ^Context_CTR, dst: []byte) {
 	assert(ctx._is_initialized)

 	dst := dst
-	for remaining := len(dst); remaining > 0; {
+	#no_bounds_check for remaining := len(dst); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == BLOCK_SIZE {
 			if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 {
@@ -12,7 +12,7 @@ Context_ECB :: struct {
 }

 // init_ecb initializes a Context_ECB with the provided key.
-init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := Implementation.Hardware) {
+init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	init_impl(&ctx._impl, key, impl)
 	ctx._is_initialized = true
 }
@@ -7,10 +7,10 @@ import "core:crypto/_aes/ct64"
 import "core:encoding/endian"
 import "core:mem"

-// GCM_NONCE_SIZE is the default size of the GCM nonce in bytes.
-GCM_NONCE_SIZE :: 12
-// GCM_NONCE_SIZE_MAX is the maximum size of the GCM nonce in bytes.
-GCM_NONCE_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits
+// GCM_IV_SIZE is the default size of the GCM IV in bytes.
+GCM_IV_SIZE :: 12
+// GCM_IV_SIZE_MAX is the maximum size of the GCM IV in bytes.
+GCM_IV_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits
 // GCM_TAG_SIZE is the size of a GCM tag in bytes.
 GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE

@@ -26,19 +26,19 @@ Context_GCM :: struct {
 }

 // init_gcm initializes a Context_GCM with the provided key.
-init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := Implementation.Hardware) {
+init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	init_impl(&ctx._impl, key, impl)
 	ctx._is_initialized = true
 }

 // seal_gcm encrypts the plaintext and authenticates the aad and ciphertext,
-// with the provided Context_GCM and nonce, stores the output in dst and tag.
+// with the provided Context_GCM and iv, stores the output in dst and tag.
 //
 // dst and plaintext MUST alias exactly or not at all.
-seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
+seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
 	assert(ctx._is_initialized)

-	gcm_validate_common_slice_sizes(tag, nonce, aad, plaintext)
+	gcm_validate_common_slice_sizes(tag, iv, aad, plaintext)
 	if len(dst) != len(plaintext) {
 		panic("crypto/aes: invalid destination ciphertext size")
 	}
@@ -47,7 +47,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
 	}

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
-		gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext)
+		gcm_seal_hw(&impl, dst, tag, iv, aad, plaintext)
 		return
 	}

@@ -55,7 +55,7 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
 	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv)

 	// Note: Our GHASH implementation handles appending padding.
 	ct64.ghash(s[:], h[:], aad)
@@ -69,15 +69,16 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
 }

 // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
-// with the provided Context_GCM, nonce, and tag, and stores the output in dst,
+// with the provided Context_GCM, iv, and tag, and stores the output in dst,
 // returning true iff the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
-open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+@(require_results)
+open_gcm :: proc(ctx: ^Context_GCM, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	assert(ctx._is_initialized)

-	gcm_validate_common_slice_sizes(tag, nonce, aad, ciphertext)
+	gcm_validate_common_slice_sizes(tag, iv, aad, ciphertext)
 	if len(dst) != len(ciphertext) {
 		panic("crypto/aes: invalid destination plaintext size")
 	}
@@ -86,14 +87,14 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
 	}

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
-		return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag)
+		return gcm_open_hw(&impl, dst, iv, aad, ciphertext, tag)
 	}

 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
 	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv)

 	ct64.ghash(s[:], h[:], aad)
 	gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false)
@@ -112,7 +113,7 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
 	return ok
 }

-// reset_ctr sanitizes the Context_GCM.  The Context_GCM must be
+// reset_gcm sanitizes the Context_GCM.  The Context_GCM must be
 // re-initialized to be used again.
 reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 	reset_impl(&ctx._impl)
@@ -120,14 +121,14 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 }

@(private = "file")
-gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) {
+gcm_validate_common_slice_sizes :: proc(tag, iv, aad, text: []byte) {
 	if len(tag) != GCM_TAG_SIZE {
 		panic("crypto/aes: invalid GCM tag size")
 	}

-	// The specification supports nonces in the range [1, 2^64) bits.
-	if l := len(nonce); l == 0 || u64(l) >= GCM_NONCE_SIZE_MAX {
-		panic("crypto/aes: invalid GCM nonce size")
+	// The specification supports IVs in the range [1, 2^64) bits.
+	if l := len(iv); l == 0 || u64(l) >= GCM_IV_SIZE_MAX {
+		panic("crypto/aes: invalid GCM IV size")
 	}

 	if aad_len := u64(len(aad)); aad_len > GCM_A_MAX {
@@ -144,7 +145,7 @@ init_ghash_ct64 :: proc(
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
 	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
-	nonce: []byte,
+	iv: []byte,
 ) {
 	impl := &ctx._impl.(ct64.Context)

@@ -152,14 +153,14 @@ init_ghash_ct64 :: proc(
 	ct64.encrypt_block(impl, h[:], h[:])

 	// Define a block, J0, as follows:
-	if l := len(nonce); l == GCM_NONCE_SIZE {
+	if l := len(iv); l == GCM_IV_SIZE {
 		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
-		copy(j0[:], nonce)
+		copy(j0[:], iv)
 		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
 	} else {
 		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
 		// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
-		ct64.ghash(j0[:], h[:], nonce)
+		ct64.ghash(j0[:], h[:], iv)

 		tmp: [_aes.GHASH_BLOCK_SIZE]byte
 		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
@@ -197,7 +198,7 @@ gctr_ct64 :: proc(
 	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	src: []byte,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
-	nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	is_seal: bool,
 ) #no_bounds_check {
 	ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 {
@@ -208,14 +209,14 @@ gctr_ct64 :: proc(
 	// Setup the counter blocks.
 	tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, ---
 	ctrs, blks: [ct64.STRIDE][]byte = ---, ---
-	ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
+	ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
 	for i in 0 ..< ct64.STRIDE {
 		// Setup scratch space for the keystream.
 		blks[i] = tmp2[i][:]

 		// Pre-copy the IV to all the counter blocks.
 		ctrs[i] = tmp[i][:]
-		copy(ctrs[i], nonce[:GCM_NONCE_SIZE])
+		copy(ctrs[i], iv[:GCM_IV_SIZE])
 	}

 	impl := &ctx._impl.(ct64.Context)
@@ -10,12 +10,12 @@ import "core:mem"
 import "core:simd/x86"

@(private)
-gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
 	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)

 	// Note: Our GHASH implementation handles appending padding.
 	hw_intel.ghash(s[:], h[:], aad)
@@ -29,12 +29,12 @@ gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext
 }

@(private)
-gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
 	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)

 	hw_intel.ghash(s[:], h[:], aad)
 	gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
@@ -59,20 +59,20 @@ init_ghash_hw :: proc(
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
 	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
-	nonce: []byte,
+	iv: []byte,
 ) {
 	// 1. Let H = CIPH(k, 0^128)
 	encrypt_block_hw(ctx, h[:], h[:])

 	// Define a block, J0, as follows:
-	if l := len(nonce); l == GCM_NONCE_SIZE {
+	if l := len(iv); l == GCM_IV_SIZE {
 		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
-		copy(j0[:], nonce)
+		copy(j0[:], iv)
 		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
 	} else {
 		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
 		// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
-		hw_intel.ghash(j0[:], h[:], nonce)
+		hw_intel.ghash(j0[:], h[:], iv)

 		tmp: [_aes.GHASH_BLOCK_SIZE]byte
 		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
@@ -109,7 +109,7 @@ gctr_hw :: proc(
 	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	src: []byte,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
-	nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	is_seal: bool,
 ) #no_bounds_check {
 	sks: [15]x86.__m128i = ---
@@ -118,8 +118,8 @@ gctr_hw :: proc(
 	}

 	// Setup the counter block
-	ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(nonce))
-	ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
+	ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
+	ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1

 	src, dst := src, dst

@@ -10,6 +10,10 @@ Context_Impl :: union {
 	Context_Impl_Hardware,
 }

+// DEFAULT_IMPLEMENTATION is the implementation that will be used by
+// default if possible.
+DEFAULT_IMPLEMENTATION :: Implementation.Hardware
+
 // Implementation is an AES implementation.  Most callers will not need
 // to use this as the package will automatically select the most performant
 // implementation available (See `is_hardware_accelerated()`).
@@ -34,11 +34,11 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
 }

@(private)
-gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
 	panic(ERR_HW_NOT_SUPPORTED)
 }

@(private)
-gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	panic(ERR_HW_NOT_SUPPORTED)
 }
@@ -2,8 +2,8 @@
 package blake2b implements the BLAKE2b hash algorithm.

 See:
- https://datatracker.ietf.org/doc/html/rfc7693
- https://www.blake2.net
+- [[ https://datatracker.ietf.org/doc/html/rfc7693 ]]
+- [[ https://www.blake2.net ]]
 */
 package blake2b

@@ -2,8 +2,8 @@
 package blake2s implements the BLAKE2s hash algorithm.

 See:
- https://datatracker.ietf.org/doc/html/rfc7693
- https://www.blake2.net/
+- [[ https://datatracker.ietf.org/doc/html/rfc7693 ]]
+- [[ https://www.blake2.net/ ]]
 */
 package blake2s

@@ -2,125 +2,72 @@
 package chacha20 implements the ChaCha20 and XChaCha20 stream ciphers.

 See:
- https://datatracker.ietf.org/doc/html/rfc8439
- https://datatracker.ietf.org/doc/draft-irtf-cfrg-xchacha/03/
+- [[ https://datatracker.ietf.org/doc/html/rfc8439 ]]
+- [[ https://datatracker.ietf.org/doc/draft-irtf-cfrg-xchacha/03/ ]]
 */
 package chacha20

 import "core:bytes"
-import "core:encoding/endian"
-import "core:math/bits"
+import "core:crypto/_chacha20"
 import "core:mem"

 // KEY_SIZE is the (X)ChaCha20 key size in bytes.
-KEY_SIZE :: 32
-// NONCE_SIZE is the ChaCha20 nonce size in bytes.
-NONCE_SIZE :: 12
-// XNONCE_SIZE is the XChaCha20 nonce size in bytes.
-XNONCE_SIZE :: 24
-
-@(private)
-_MAX_CTR_IETF :: 0xffffffff
-
-@(private)
-_BLOCK_SIZE :: 64
-@(private)
-_STATE_SIZE_U32 :: 16
-@(private)
-_ROUNDS :: 20
-
-@(private)
-_SIGMA_0: u32 : 0x61707865
-@(private)
-_SIGMA_1: u32 : 0x3320646e
-@(private)
-_SIGMA_2: u32 : 0x79622d32
-@(private)
-_SIGMA_3: u32 : 0x6b206574
+KEY_SIZE :: _chacha20.KEY_SIZE
+// IV_SIZE is the ChaCha20 IV size in bytes.
+IV_SIZE :: _chacha20.IV_SIZE
+// XIV_SIZE is the XChaCha20 IV size in bytes.
+XIV_SIZE :: _chacha20.XIV_SIZE

 // Context is a ChaCha20 or XChaCha20 instance.
 Context :: struct {
-	_s:              [_STATE_SIZE_U32]u32,
-	_buffer:         [_BLOCK_SIZE]byte,
-	_off:            int,
-	_is_ietf_flavor: bool,
-	_is_initialized: bool,
+	_state: _chacha20.Context,
+	_impl:  Implementation,
 }

 // init inititializes a Context for ChaCha20 or XChaCha20 with the provided
-// key and nonce.
-init :: proc(ctx: ^Context, key, nonce: []byte) {
+// key and iv.
+init :: proc(ctx: ^Context, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20: invalid ChaCha20 key size")
+		panic("crypto/chacha20: invalid (X)ChaCha20 key size")
 	}
-	if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE {
-		panic("crypto/chacha20: invalid (X)ChaCha20 nonce size")
+	if l := len(iv); l != IV_SIZE && l != XIV_SIZE {
+		panic("crypto/chacha20: invalid (X)ChaCha20 IV size")
 	}

-	k, n := key, nonce
+	k, n := key, iv

-	// Derive the XChaCha20 subkey and sub-nonce via HChaCha20.
-	is_xchacha := len(nonce) == XNONCE_SIZE
+	init_impl(ctx, impl)
+
+	is_xchacha := len(iv) == XIV_SIZE
 	if is_xchacha {
-		sub_key := ctx._buffer[:KEY_SIZE]
-		_hchacha20(sub_key, k, n)
+		sub_iv: [IV_SIZE]byte
+		sub_key := ctx._state._buffer[:KEY_SIZE]
+		hchacha20(sub_key, k, n, ctx._impl)
 		k = sub_key
-		n = n[16:24]
+		copy(sub_iv[4:], n[16:])
+		n = sub_iv[:]
 	}

-	ctx._s[0] = _SIGMA_0
-	ctx._s[1] = _SIGMA_1
-	ctx._s[2] = _SIGMA_2
-	ctx._s[3] = _SIGMA_3
-	ctx._s[4] = endian.unchecked_get_u32le(k[0:4])
-	ctx._s[5] = endian.unchecked_get_u32le(k[4:8])
-	ctx._s[6] = endian.unchecked_get_u32le(k[8:12])
-	ctx._s[7] = endian.unchecked_get_u32le(k[12:16])
-	ctx._s[8] = endian.unchecked_get_u32le(k[16:20])
-	ctx._s[9] = endian.unchecked_get_u32le(k[20:24])
-	ctx._s[10] = endian.unchecked_get_u32le(k[24:28])
-	ctx._s[11] = endian.unchecked_get_u32le(k[28:32])
-	ctx._s[12] = 0
-	if !is_xchacha {
-		ctx._s[13] = endian.unchecked_get_u32le(n[0:4])
-		ctx._s[14] = endian.unchecked_get_u32le(n[4:8])
-		ctx._s[15] = endian.unchecked_get_u32le(n[8:12])
-	} else {
-		ctx._s[13] = 0
-		ctx._s[14] = endian.unchecked_get_u32le(n[0:4])
-		ctx._s[15] = endian.unchecked_get_u32le(n[4:8])
+	_chacha20.init(&ctx._state, k, n, is_xchacha)

+	if is_xchacha {
 		// The sub-key is stored in the keystream buffer.  While
 		// this will be overwritten in most circumstances, explicitly
 		// clear it out early.
-		mem.zero_explicit(&ctx._buffer, KEY_SIZE)
+		mem.zero_explicit(&ctx._state._buffer, KEY_SIZE)
 	}
-
-	ctx._off = _BLOCK_SIZE
-	ctx._is_ietf_flavor = !is_xchacha
-	ctx._is_initialized = true
 }

 // seek seeks the (X)ChaCha20 stream counter to the specified block.
 seek :: proc(ctx: ^Context, block_nr: u64) {
-	assert(ctx._is_initialized)
-
-	if ctx._is_ietf_flavor {
-		if block_nr > _MAX_CTR_IETF {
-			panic("crypto/chacha20: attempted to seek past maximum counter")
-		}
-	} else {
-		ctx._s[13] = u32(block_nr >> 32)
-	}
-	ctx._s[12] = u32(block_nr)
-	ctx._off = _BLOCK_SIZE
+	_chacha20.seek(&ctx._state, block_nr)
 }

 // xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20
 // keystream, and writes the resulting output to dst.  Dst and src MUST
 // alias exactly or not at all.
 xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
+	assert(ctx._state._is_initialized)

 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
@@ -131,12 +78,13 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
 		panic("crypto/chacha20: dst and src alias inexactly")
 	}

-	for remaining := len(src); remaining > 0; {
+	st := &ctx._state
+	#no_bounds_check for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
-		if ctx._off == _BLOCK_SIZE {
-			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
-				direct_bytes := nr_blocks * _BLOCK_SIZE
-				_do_blocks(ctx, dst, src, nr_blocks)
+		if st._off == _chacha20.BLOCK_SIZE {
+			if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+				stream_blocks(ctx, dst, src, nr_blocks)
 				remaining -= direct_bytes
 				if remaining == 0 {
 					return
@@ -147,17 +95,17 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {

 			// If there is a partial block, generate and buffer 1 block
 			// worth of keystream.
-			_do_blocks(ctx, ctx._buffer[:], nil, 1)
-			ctx._off = 0
+			stream_blocks(ctx, st._buffer[:], nil, 1)
+			st._off = 0
 		}

 		// Process partial blocks from the buffered keystream.
-		to_xor := min(_BLOCK_SIZE - ctx._off, remaining)
-		buffered_keystream := ctx._buffer[ctx._off:]
+		to_xor := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+		buffered_keystream := st._buffer[st._off:]
 		for i := 0; i < to_xor; i = i + 1 {
 			dst[i] = buffered_keystream[i] ~ src[i]
 		}
-		ctx._off += to_xor
+		st._off += to_xor
 		dst = dst[to_xor:]
 		src = src[to_xor:]
 		remaining -= to_xor
@@ -166,15 +114,15 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {

 // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output.
 keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx._is_initialized)
+	assert(ctx._state._is_initialized)

-	dst := dst
-	for remaining := len(dst); remaining > 0; {
+	dst, st := dst, &ctx._state
+	#no_bounds_check for remaining := len(dst); remaining > 0; {
 		// Process multiple blocks at once
-		if ctx._off == _BLOCK_SIZE {
-			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
-				direct_bytes := nr_blocks * _BLOCK_SIZE
-				_do_blocks(ctx, dst, nil, nr_blocks)
+		if st._off == _chacha20.BLOCK_SIZE {
+			if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+				stream_blocks(ctx, dst, nil, nr_blocks)
 				remaining -= direct_bytes
 				if remaining == 0 {
 					return
@@ -184,15 +132,15 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {

 			// If there is a partial block, generate and buffer 1 block
 			// worth of keystream.
-			_do_blocks(ctx, ctx._buffer[:], nil, 1)
-			ctx._off = 0
+			stream_blocks(ctx, st._buffer[:], nil, 1)
+			st._off = 0
 		}

 		// Process partial blocks from the buffered keystream.
-		to_copy := min(_BLOCK_SIZE - ctx._off, remaining)
-		buffered_keystream := ctx._buffer[ctx._off:]
+		to_copy := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+		buffered_keystream := st._buffer[st._off:]
 		copy(dst[:to_copy], buffered_keystream[:to_copy])
-		ctx._off += to_copy
+		st._off += to_copy
 		dst = dst[to_copy:]
 		remaining -= to_copy
 	}
@@ -201,366 +149,5 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
 // reset sanitizes the Context.  The Context must be re-initialized to
 // be used again.
 reset :: proc(ctx: ^Context) {
-	mem.zero_explicit(&ctx._s, size_of(ctx._s))
-	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
-
-	ctx._is_initialized = false
-}
-
-@(private)
-_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
-	// Enforce the maximum consumed keystream per nonce.
-	//
-	// While all modern "standard" definitions of ChaCha20 use
-	// the IETF 32-bit counter, for XChaCha20 most common
-	// implementations allow for a 64-bit counter.
-	//
-	// Honestly, the answer here is "use a MRAE primitive", but
-	// go with common practice in the case of XChaCha20.
-	if ctx._is_ietf_flavor {
-		if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff {
-			panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached")
-		}
-	} else {
-		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
-		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
-			panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached")
-		}
-	}
-
-	dst, src := dst, src
-	x := &ctx._s
-	for n := 0; n < nr_blocks; n = n + 1 {
-		x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
-
-		for i := _ROUNDS; i > 0; i = i - 2 {
-			// Even when forcing inlining manually inlining all of
-			// these is decently faster.
-
-			// quarterround(x, 0, 4, 8, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 16)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 8)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 7)
-
-			// quarterround(x, 1, 5, 9, 13)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 16)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 12)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 8)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 2, 6, 10, 14)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 16)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 12)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 8)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 3, 7, 11, 15)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 16)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 12)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 8)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 0, 5, 10, 15)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 16)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 12)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 8)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 1, 6, 11, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 16)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 8)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 2, 7, 8, 13)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 16)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 12)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 8)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 3, 4, 9, 14)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 16)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 12)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 8)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 7)
-		}
-
-		x0 += _SIGMA_0
-		x1 += _SIGMA_1
-		x2 += _SIGMA_2
-		x3 += _SIGMA_3
-		x4 += x[4]
-		x5 += x[5]
-		x6 += x[6]
-		x7 += x[7]
-		x8 += x[8]
-		x9 += x[9]
-		x10 += x[10]
-		x11 += x[11]
-		x12 += x[12]
-		x13 += x[13]
-		x14 += x[14]
-		x15 += x[15]
-
-		// While the "correct" answer to getting more performance out of
-		// this is "use vector operations", support for that is currently
-		// a work in progress/to be designed.
-		//
-		// In the meantime:
-		// - The caller(s) ensure that src/dst are valid.
-		// - The compiler knows if the target is picky about alignment.
-
-		#no_bounds_check {
-			if src != nil {
-				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
-				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
-				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
-				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
-				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
-				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
-				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
-				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
-				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
-				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
-				endian.unchecked_put_u32le(dst[40:44], endian.unchecked_get_u32le(src[40:44]) ~ x10)
-				endian.unchecked_put_u32le(dst[44:48], endian.unchecked_get_u32le(src[44:48]) ~ x11)
-				endian.unchecked_put_u32le(dst[48:52], endian.unchecked_get_u32le(src[48:52]) ~ x12)
-				endian.unchecked_put_u32le(dst[52:56], endian.unchecked_get_u32le(src[52:56]) ~ x13)
-				endian.unchecked_put_u32le(dst[56:60], endian.unchecked_get_u32le(src[56:60]) ~ x14)
-				endian.unchecked_put_u32le(dst[60:64], endian.unchecked_get_u32le(src[60:64]) ~ x15)
-				src = src[_BLOCK_SIZE:]
-			} else {
-				endian.unchecked_put_u32le(dst[0:4], x0)
-				endian.unchecked_put_u32le(dst[4:8], x1)
-				endian.unchecked_put_u32le(dst[8:12], x2)
-				endian.unchecked_put_u32le(dst[12:16], x3)
-				endian.unchecked_put_u32le(dst[16:20], x4)
-				endian.unchecked_put_u32le(dst[20:24], x5)
-				endian.unchecked_put_u32le(dst[24:28], x6)
-				endian.unchecked_put_u32le(dst[28:32], x7)
-				endian.unchecked_put_u32le(dst[32:36], x8)
-				endian.unchecked_put_u32le(dst[36:40], x9)
-				endian.unchecked_put_u32le(dst[40:44], x10)
-				endian.unchecked_put_u32le(dst[44:48], x11)
-				endian.unchecked_put_u32le(dst[48:52], x12)
-				endian.unchecked_put_u32le(dst[52:56], x13)
-				endian.unchecked_put_u32le(dst[56:60], x14)
-				endian.unchecked_put_u32le(dst[60:64], x15)
-			}
-			dst = dst[_BLOCK_SIZE:]
-		}
-
-		// Increment the counter.  Overflow checking is done upon
-		// entry into the routine, so a 64-bit increment safely
-		// covers both cases.
-		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
-		x[12] = u32(new_ctr)
-		x[13] = u32(new_ctr >> 32)
-	}
-}
-
-@(private)
-_hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
-	x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
-	x4 := endian.unchecked_get_u32le(key[0:4])
-	x5 := endian.unchecked_get_u32le(key[4:8])
-	x6 := endian.unchecked_get_u32le(key[8:12])
-	x7 := endian.unchecked_get_u32le(key[12:16])
-	x8 := endian.unchecked_get_u32le(key[16:20])
-	x9 := endian.unchecked_get_u32le(key[20:24])
-	x10 := endian.unchecked_get_u32le(key[24:28])
-	x11 := endian.unchecked_get_u32le(key[28:32])
-	x12 := endian.unchecked_get_u32le(nonce[0:4])
-	x13 := endian.unchecked_get_u32le(nonce[4:8])
-	x14 := endian.unchecked_get_u32le(nonce[8:12])
-	x15 := endian.unchecked_get_u32le(nonce[12:16])
-
-	for i := _ROUNDS; i > 0; i = i - 2 {
-		// quarterround(x, 0, 4, 8, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 16)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 8)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 7)
-
-		// quarterround(x, 1, 5, 9, 13)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 16)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 12)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 8)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 2, 6, 10, 14)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 16)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 12)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 8)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 3, 7, 11, 15)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 16)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 12)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 8)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 0, 5, 10, 15)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 16)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 12)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 8)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 1, 6, 11, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 16)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 8)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 2, 7, 8, 13)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 16)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 12)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 8)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 3, 4, 9, 14)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 16)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 12)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 8)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 7)
-	}
-
-	endian.unchecked_put_u32le(dst[0:4], x0)
-	endian.unchecked_put_u32le(dst[4:8], x1)
-	endian.unchecked_put_u32le(dst[8:12], x2)
-	endian.unchecked_put_u32le(dst[12:16], x3)
-	endian.unchecked_put_u32le(dst[16:20], x12)
-	endian.unchecked_put_u32le(dst[20:24], x13)
-	endian.unchecked_put_u32le(dst[24:28], x14)
-	endian.unchecked_put_u32le(dst[28:32], x15)
+	_chacha20.reset(&ctx._state)
 }
@@ -0,0 +1,56 @@
+package chacha20
+
+import "base:intrinsics"
+import "core:crypto/_chacha20/ref"
+import "core:crypto/_chacha20/simd128"
+import "core:crypto/_chacha20/simd256"
+
+// DEFAULT_IMPLEMENTATION is the implementation that will be used by
+// default if possible.
+DEFAULT_IMPLEMENTATION :: Implementation.Simd256
+
+// Implementation is a ChaCha20 implementation.  Most callers will not need
+// to use this as the package will automatically select the most performant
+// implementation available.
+Implementation :: enum {
+	Portable,
+	Simd128,
+	Simd256,
+}
+
+@(private)
+init_impl :: proc(ctx: ^Context, impl: Implementation) {
+	impl := impl // Shadow the parameter so the requested implementation can be downgraded below.
+	if impl == .Simd256 && !simd256.is_performant() {
+		impl = .Simd128 // Step down one tier; the next check may step down again.
+	}
+	if impl == .Simd128 && !simd128.is_performant() {
+		impl = .Portable
+	}
+
+	ctx._impl = impl // Record the implementation that was actually selected.
+}
+
+@(private)
+stream_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+	switch ctx._impl { // Forward to the backend recorded in the context; all arguments pass through unchanged.
+	case .Simd256:
+		simd256.stream_blocks(&ctx._state, dst, src, nr_blocks)
+	case .Simd128:
+		simd128.stream_blocks(&ctx._state, dst, src, nr_blocks)
+	case .Portable:
+		ref.stream_blocks(&ctx._state, dst, src, nr_blocks)
+	}
+}
+
+@(private)
+hchacha20 :: proc "contextless" (dst, key, iv: []byte, impl: Implementation) {
+	switch impl { // Unlike stream_blocks, the backend comes from the caller-supplied impl rather than a Context.
+	case .Simd256:
+		simd256.hchacha20(dst, key, iv)
+	case .Simd128:
+		simd128.hchacha20(dst, key, iv)
+	case .Portable:
+		ref.hchacha20(dst, key, iv)
+	}
+}
@@ -1,9 +1,11 @@
 /*
-package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 Authenticated
-Encryption with Additional Data algorithm.
+package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 and
+AEAD_XChaCha20_Poly1305 Authenticated Encryption with Additional Data
+algorithms.

 See:
- https://www.rfc-editor.org/rfc/rfc8439
+- [[ https://www.rfc-editor.org/rfc/rfc8439 ]]
+- [[ https://datatracker.ietf.org/doc/html/draft-arciszewski-xchacha-03 ]]
 */
 package chacha20poly1305

@@ -15,8 +17,10 @@ import "core:mem"

 // KEY_SIZE is the chacha20poly1305 key size in bytes.
 KEY_SIZE :: chacha20.KEY_SIZE
-// NONCE_SIZE is the chacha20poly1305 nonce size in bytes.
-NONCE_SIZE :: chacha20.NONCE_SIZE
+// IV_SIZE is the chacha20poly1305 IV size in bytes.
+IV_SIZE :: chacha20.IV_SIZE
+// XIV_SIZE is the xchacha20poly1305 IV size in bytes.
+XIV_SIZE :: chacha20.XIV_SIZE
 // TAG_SIZE is the chacha20poly1305 tag size in bytes.
 TAG_SIZE :: poly1305.TAG_SIZE

@@ -24,15 +28,13 @@ TAG_SIZE :: poly1305.TAG_SIZE
 _P_MAX :: 64 * 0xffffffff // 64 * (2^32-1)

@(private)
-_validate_common_slice_sizes :: proc (tag, key, nonce, aad, text: []byte) {
+_validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bool) {
 	if len(tag) != TAG_SIZE {
 		panic("crypto/chacha20poly1305: invalid destination tag size")
 	}
-	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20poly1305: invalid key size")
-	}
-	if len(nonce) != NONCE_SIZE {
-		panic("crypto/chacha20poly1305: invalid nonce size")
+	expected_iv_len := is_xchacha ? XIV_SIZE : IV_SIZE
+	if len(iv) != expected_iv_len {
+		panic("crypto/chacha20poly1305: invalid IV size")
 	}

 	#assert(size_of(int) == 8 || size_of(int) <= 4)
@@ -59,18 +61,52 @@ _update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) {
 	}
 }

-// encrypt encrypts the plaintext and authenticates the aad and ciphertext,
-// with the provided key and nonce, stores the output in ciphertext and tag.
-encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
-	_validate_common_slice_sizes(tag, key, nonce, aad, plaintext)
+// Context is a keyed (X)Chacha20Poly1305 instance.
+Context :: struct {
+	_key:            [KEY_SIZE]byte, // Copy of the key supplied to init.
+	_impl:           chacha20.Implementation, // Implementation passed on to chacha20.init by seal/open.
+	_is_xchacha:     bool, // Set by init_xchacha; makes IV validation expect XIV_SIZE instead of IV_SIZE.
+	_is_initialized: bool, // Set by init, cleared by reset.
+}
+
+// init initializes a Context with the provided key, for AEAD_CHACHA20_POLY1305.  It panics if len(key) != KEY_SIZE.
+init :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) {
+	if len(key) != KEY_SIZE {
+		panic("crypto/chacha20poly1305: invalid key size")
+	}
+
+	copy(ctx._key[:], key) // The Context keeps its own copy of the key.
+	ctx._impl = impl
+	ctx._is_xchacha = false // init_xchacha overrides this after calling init.
+	ctx._is_initialized = true
+}
+
+// init_xchacha initializes a Context with the provided key, for
+// AEAD_XChaCha20_Poly1305.
+//
+// Note: While there are multiple definitions of XChaCha20-Poly1305
+// this sticks to the IETF draft and uses a 32-bit counter.
+init_xchacha :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) {
+	init(ctx, key, impl) // Performs the key-size check and stores the key and impl.
+	ctx._is_xchacha = true // Makes seal/open validate the IV against XIV_SIZE.
+}
+
+// seal encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided Context and iv, stores the output in dst and tag.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ciphertext := dst
+	_validate_common_slice_sizes(tag, iv, aad, plaintext, ctx._is_xchacha)
 	if len(ciphertext) != len(plaintext) {
 		panic("crypto/chacha20poly1305: invalid destination ciphertext size")
 	}

 	stream_ctx: chacha20.Context = ---
-	chacha20.init(&stream_ctx, key, nonce)
+	chacha20.init(&stream_ctx, ctx._key[:],iv, ctx._impl)
+	stream_ctx._state._is_ietf_flavor = true

-	// otk = poly1305_key_gen(key, nonce)
+	// otk = poly1305_key_gen(key, iv)
 	otk: [poly1305.KEY_SIZE]byte = ---
 	chacha20.keystream_bytes(&stream_ctx, otk[:])
 	mac_ctx: poly1305.Context = ---
@@ -87,7 +123,7 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	poly1305.update(&mac_ctx, aad)
 	_update_mac_pad16(&mac_ctx, aad_len)

-	// ciphertext = chacha20_encrypt(key, 1, nonce, plaintext)
+	// ciphertext = chacha20_encrypt(key, 1, iv, plaintext)
 	chacha20.seek(&stream_ctx, 1)
 	chacha20.xor_bytes(&stream_ctx, ciphertext, plaintext)
 	chacha20.reset(&stream_ctx) // Don't need the stream context anymore.
@@ -107,13 +143,16 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context.
 }

-// decrypt authenticates the aad and ciphertext, and decrypts the ciphertext,
-// with the provided key, nonce, and tag, and stores the output in plaintext,
-// returning true iff the authentication was successful.
+// open authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
 //
-// If authentication fails, the destination plaintext buffer will be zeroed.
-decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
-	_validate_common_slice_sizes(tag, key, nonce, aad, ciphertext)
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	plaintext := dst
+	_validate_common_slice_sizes(tag, iv, aad, ciphertext, ctx._is_xchacha)
 	if len(ciphertext) != len(plaintext) {
 		panic("crypto/chacha20poly1305: invalid destination plaintext size")
 	}
@@ -123,9 +162,10 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
 	// points where needed.

 	stream_ctx: chacha20.Context = ---
-	chacha20.init(&stream_ctx, key, nonce)
+	chacha20.init(&stream_ctx, ctx._key[:], iv, ctx._impl)
+	stream_ctx._state._is_ietf_flavor = true

-	// otk = poly1305_key_gen(key, nonce)
+	// otk = poly1305_key_gen(key, iv)
 	otk: [poly1305.KEY_SIZE]byte = ---
 	chacha20.keystream_bytes(&stream_ctx, otk[:])
 	defer chacha20.reset(&stream_ctx)
@@ -160,9 +200,17 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
 		return false
 	}

-	// plaintext = chacha20_decrypt(key, 1, nonce, ciphertext)
+	// plaintext = chacha20_decrypt(key, 1, iv, ciphertext)
 	chacha20.seek(&stream_ctx, 1)
 	chacha20.xor_bytes(&stream_ctx, plaintext, ciphertext)

 	return true
 }
+
+// reset sanitizes the Context.  The Context must be
+// re-initialized to be used again.
+reset :: proc "contextless" (ctx: ^Context) {
+	mem.zero_explicit(&ctx._key, len(ctx._key)) // Wipe the stored key bytes.
+	ctx._is_xchacha = false
+	ctx._is_initialized = false // _impl is left as-is; init overwrites it on re-initialization.
+}
@@ -2,9 +2,9 @@
 package ed25519 implements the Ed25519 EdDSA signature algorithm.

 See:
- https://datatracker.ietf.org/doc/html/rfc8032
- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.186-5.pdf
- https://eprint.iacr.org/2020/1244.pdf
+- [[ https://datatracker.ietf.org/doc/html/rfc8032 ]]
+- [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.186-5.pdf ]]
+- [[ https://eprint.iacr.org/2020/1244.pdf ]]
 */
 package ed25519

@@ -21,7 +21,7 @@ PUBLIC_KEY_SIZE :: 32
 SIGNATURE_SIZE :: 64

@(private)
-NONCE_SIZE :: 32
+HDIGEST2_SIZE :: 32

 // Private_Key is an Ed25519 private key.
 Private_Key :: struct {
@@ -33,7 +33,7 @@ Private_Key :: struct {
 	// See: https://github.com/MystenLabs/ed25519-unsafe-libs
 	_b:              [PRIVATE_KEY_SIZE]byte,
 	_s:              grp.Scalar,
-	_nonce:          [NONCE_SIZE]byte,
+	_hdigest2:       [HDIGEST2_SIZE]byte,
 	_pub_key:        Public_Key,
 	_is_initialized: bool,
 }
@@ -63,7 +63,7 @@ private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
 	sha2.final(&ctx, h_bytes[:])

 	copy(priv_key._b[:], b)
-	copy(priv_key._nonce[:], h_bytes[32:])
+	copy(priv_key._hdigest2[:], h_bytes[32:])
 	grp.sc_set_bytes_rfc8032(&priv_key._s, h_bytes[:32])

 	// Derive the corresponding public key.
@@ -116,7 +116,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
 	ctx: sha2.Context_512 = ---
 	digest_bytes: [sha2.DIGEST_SIZE_512]byte = ---
 	sha2.init_512(&ctx)
-	sha2.update(&ctx, priv_key._nonce[:])
+	sha2.update(&ctx, priv_key._hdigest2[:])
 	sha2.update(&ctx, msg)
 	sha2.final(&ctx, digest_bytes[:])

@@ -17,46 +17,44 @@ accomplish common tasks.
  A third optional boolean parameter controls if the file is streamed
  (default), or or read at once.

-```odin
-package hash_example
+Example:
+	package hash_example

-import "core:crypto/hash"
+	import "core:crypto/hash"

-main :: proc() {
-	input := "Feed the fire."
+	main :: proc() {
+		input := "Feed the fire."

-	// Compute the digest, using the high level API.
-	returned_digest := hash.hash(hash.Algorithm.SHA512_256, input)
-	defer delete(returned_digest)
+		// Compute the digest, using the high level API.
+		returned_digest := hash.hash(hash.Algorithm.SHA512_256, input)
+		defer delete(returned_digest)

-	// Variant that takes a destination buffer, instead of returning
-	// the digest.
-	digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.BLAKE2B]) // @note: Destination buffer has to be at least as big as the digest size of the hash.
-	defer delete(digest)
-	hash.hash(hash.Algorithm.BLAKE2B, input, digest)
-}
-```
+		// Variant that takes a destination buffer, instead of returning
+		// the digest.
+		digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.BLAKE2B]) // @note: Destination buffer has to be at least as big as the digest size of the hash.
+		defer delete(digest)
+		hash.hash(hash.Algorithm.BLAKE2B, input, digest)
+	}

 A generic low level API is provided supporting the init/update/final interface
 that is typical with cryptographic hash function implementations.

-```odin
-package hash_example
+Example:
+	package hash_example

-import "core:crypto/hash"
+	import "core:crypto/hash"

-main :: proc() {
-    input := "Let the cinders burn."
+	main :: proc() {
+		input := "Let the cinders burn."

-    // Compute the digest, using the low level API.
-    ctx: hash.Context
-    digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.SHA3_512])
-    defer delete(digest)
+		// Compute the digest, using the low level API.
+		ctx: hash.Context
+		digest := make([]byte, hash.DIGEST_SIZES[hash.Algorithm.SHA3_512])
+		defer delete(digest)

-    hash.init(&ctx, hash.Algorithm.SHA3_512)
-    hash.update(&ctx, transmute([]byte)input)
-    hash.final(&ctx, digest)
-}
-```
+		hash.init(&ctx, hash.Algorithm.SHA3_512)
+		hash.update(&ctx, transmute([]byte)input)
+		hash.final(&ctx, digest)
+	}
 */
-package crypto_hash
+package crypto_hash
@@ -28,20 +28,26 @@ hash_bytes :: proc(algorithm: Algorithm, data: []byte, allocator := context.allo

 // hash_string_to_buffer will hash the given input and assign the
 // computed digest to the third parameter.  It requires that the
-// destination buffer is at least as big as the digest size.
-hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) {
-	hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash)
+// destination buffer is at least as big as the digest size.  The
+// provided destination buffer is returned to match the behavior of
+// `hash_string`.
+hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) -> []byte {
+	return hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash)
 }

 // hash_bytes_to_buffer will hash the given input and write the
 // computed digest into the third parameter.  It requires that the
-// destination buffer is at least as big as the digest size.
-hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) {
+// destination buffer is at least as big as the digest size.  The
+// provided destination buffer is returned to match the behavior of
+// `hash_bytes`.
+hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) -> []byte {
 	ctx: Context

 	init(&ctx, algorithm)
 	update(&ctx, data)
 	final(&ctx, hash)
+
+	return hash
 }

 // hash_stream will incrementally fully consume a stream, and return the
@@ -2,7 +2,7 @@
 package hkdf implements the HKDF HMAC-based Extract-and-Expand Key
 Derivation Function.

-See: https://www.rfc-editor.org/rfc/rfc5869
+See: [[ https://www.rfc-editor.org/rfc/rfc5869 ]]
 */
 package hkdf

@@ -2,7 +2,7 @@
 package hmac implements the HMAC MAC algorithm.

 See:
- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.198-1.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.198-1.pdf ]]
 */
 package hmac

@@ -2,7 +2,7 @@
 package kmac implements the KMAC MAC algorithm.

 See:
- https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf ]]
 */
 package kmac

@@ -5,8 +5,8 @@ WARNING: The MD5 algorithm is known to be insecure and should only be
 used for interoperating with legacy applications.

 See:
- https://eprint.iacr.org/2005/075
- https://datatracker.ietf.org/doc/html/rfc1321
+- [[ https://eprint.iacr.org/2005/075 ]]
+- [[ https://datatracker.ietf.org/doc/html/rfc1321 ]]
 */
 package md5

@@ -5,9 +5,9 @@ WARNING: The SHA1 algorithm is known to be insecure and should only be
 used for interoperating with legacy applications.

 See:
- https://eprint.iacr.org/2017/190
- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
- https://datatracker.ietf.org/doc/html/rfc3174
+- [[ https://eprint.iacr.org/2017/190 ]]
+- [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf ]]
+- [[ https://datatracker.ietf.org/doc/html/rfc3174 ]]
 */
 package sha1

@@ -1,7 +1,7 @@
 /*
 package pbkdf2 implements the PBKDF2 password-based key derivation function.

-See: https://www.rfc-editor.org/rfc/rfc2898
+See: [[ https://www.rfc-editor.org/rfc/rfc2898 ]]
 */
 package pbkdf2

@@ -2,7 +2,7 @@
 package poly1305 implements the Poly1305 one-time MAC algorithm.

 See:
- https://datatracker.ietf.org/doc/html/rfc8439
+- [[ https://datatracker.ietf.org/doc/html/rfc8439 ]]
 */
 package poly1305

@@ -2,7 +2,7 @@
 package ristretto255 implement the ristretto255 prime-order group.

 See:
- https://www.rfc-editor.org/rfc/rfc9496
+- [[ https://www.rfc-editor.org/rfc/rfc9496 ]]
 */
 package ristretto255

@@ -2,8 +2,8 @@
 package sha2 implements the SHA2 hash algorithm family.

 See:
- https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
- https://datatracker.ietf.org/doc/html/rfc3874
+- [[ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf ]]
+- [[ https://datatracker.ietf.org/doc/html/rfc3874 ]]
 */
 package sha2

@@ -6,7 +6,7 @@ pre-standardization Keccak algorithm is required, it can be found in
 crypto/legacy/keccak.

 See:
- https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf ]]
 */
 package sha3

@@ -4,8 +4,8 @@ package shake implements the SHAKE and cSHAKE XOF algorithm families.
 The SHA3 hash algorithm can be found in the crypto/sha3.

 See:
- https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf
- https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/fips/nist.fips.202.pdf ]]
+- [[ https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf ]]
 */
 package shake

@@ -1,3 +1,12 @@
+/*
+package siphash implements the SipHash hashing algorithm.
+
+Use the specific procedures for a certain setup. The generic procedures will default to Siphash 2-4.
+
+See:
+- [[ https://github.com/veorq/SipHash ]]
+- [[ https://www.aumasson.jp/siphash/siphash.pdf ]]
+*/
 package siphash

 /*
@@ -6,10 +15,6 @@ package siphash

    List of contributors:
        zhibog:  Initial implementation.
-
-    Implementation of the SipHash hashing algorithm, as defined at <https://github.com/veorq/SipHash> and <https://www.aumasson.jp/siphash/siphash.pdf>
-
-    Use the specific procedures for a certain setup. The generic procdedures will default to Siphash 2-4
 */

 import "core:crypto"
@@ -2,7 +2,7 @@
 package sm3 implements the SM3 hash algorithm.

 See:
- https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
+- [[ https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02 ]]
 */
 package sm3

@@ -2,7 +2,7 @@
 package tuplehash implements the TupleHash and TupleHashXOF algorithms.

 See:
- https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf
+- [[ https://nvlpubs.nist.gov/nistpubs/specialpublications/nist.sp.800-185.pdf ]]
 */
 package tuplehash

@@ -3,7 +3,7 @@ package x25519 implements the X25519 (aka curve25519) Elliptic-Curve
 Diffie-Hellman key exchange protocol.

 See:
- https://www.rfc-editor.org/rfc/rfc7748
+- [[ https://www.rfc-editor.org/rfc/rfc7748 ]]
 */
 package x25519

@@ -1,4 +1,6 @@
-//+build !windows !linux !darwin
+//+build !windows
+//+build !linux
+//+build !darwin
 package debug_trace

 import "base:runtime"
@@ -4,7 +4,6 @@ Package `core:dynlib` implements loading of shared libraries/DLLs and their symb
 The behaviour of dynamically loaded libraries is specific to the target platform of the program.
 For in depth detail on the underlying behaviour please refer to your target platform's documentation.

-See `example` directory for an example library exporting 3 symbols and a host program loading them automatically
-by defining a symbol table struct.
+For a full example, see: [[ core/dynlib/example; https://github.com/odin-lang/Odin/tree/master/core/dynlib/example ]]
 */
 package dynlib
@@ -13,8 +13,8 @@ If your terminal supports 24-bit true color mode, you can also do this:
 	fmt.println(ansi.CSI + ansi.FG_COLOR_24_BIT + ";0;255;255" + ansi.SGR + "Hellope!" + ansi.CSI + ansi.RESET + ansi.SGR)

 For more information, see:
-	1. https://en.wikipedia.org/wiki/ANSI_escape_code
-	2. https://www.vt100.net/docs/vt102-ug/chapter5.html
-	3. https://invisible-island.net/xterm/ctlseqs/ctlseqs.html
+- [[ https://en.wikipedia.org/wiki/ANSI_escape_code ]]
+- [[ https://www.vt100.net/docs/vt102-ug/chapter5.html ]]
+- [[ https://invisible-island.net/xterm/ctlseqs/ctlseqs.html ]]
 */
 package ansi
@@ -3,6 +3,7 @@ package encoding_cbor
 import "base:intrinsics"

 import "core:encoding/json"
+import "core:encoding/hex"
 import "core:io"
 import "core:mem"
 import "core:strconv"
@@ -399,11 +400,11 @@ to_diagnostic_format_writer :: proc(w: io.Writer, val: Value, padding := 0) -> i
 		io.write_string(w, str) or_return

 	case bool: io.write_string(w, "true" if v else "false") or_return
-	case Nil: io.write_string(w, "nil") or_return
+	case Nil: io.write_string(w, "null") or_return
 	case Undefined: io.write_string(w, "undefined") or_return
 	case ^Bytes:
 		io.write_string(w, "h'") or_return
-		for b in v { io.write_int(w, int(b), 16) or_return }
+		hex.encode_into_writer(w, v^) or_return
 		io.write_string(w, "'") or_return
 	case ^Text:
 		io.write_string(w, `"`) or_return
@@ -481,9 +481,7 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 			}
 		}

-		marshal_entry :: #force_inline proc(e: Encoder, info: runtime.Type_Info_Struct, v: any, name: string, i: int) -> Marshal_Error {
-			err_conv(_encode_text(e, name)) or_return
-
+		marshal_entry :: #force_inline proc(e: Encoder, info: runtime.Type_Info_Struct, v: any, i: int) -> Marshal_Error {
 			id := info.types[i].id
 			data := rawptr(uintptr(v.data) + info.offsets[i])
 			field_any := any{data, id}
@@ -517,7 +515,7 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er

 		if .Deterministic_Map_Sorting in e.flags {
 			Name :: struct {
-				name:  string,
+				name:  []byte,
 				field: int,
 			}
 			entries := make([dynamic]Name, 0, n, e.temp_allocator) or_return
@@ -529,16 +527,19 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 					continue
 				}

-				append(&entries, Name{fname, i}) or_return
+				key_builder := strings.builder_make(e.temp_allocator) or_return
+				err_conv(_encode_text(Encoder{e.flags, strings.to_stream(&key_builder), e.temp_allocator}, fname)) or_return
+				append(&entries, Name{key_builder.buf[:], i}) or_return
 			}

 			// Sort lexicographic on the bytes of the key.
 			slice.sort_by_cmp(entries[:], proc(a, b: Name) -> slice.Ordering {
-				return slice.Ordering(bytes.compare(transmute([]byte)a.name, transmute([]byte)b.name))
+				return slice.Ordering(bytes.compare(a.name, b.name))
 			})

 			for entry in entries {
-				marshal_entry(e, info, v, entry.name, entry.field) or_return
+				io.write_full(e.writer, entry.name) or_return
+				marshal_entry(e, info, v, entry.field) or_return
 			}
 		} else {
 			for _, i in info.names[:info.field_count] {
@@ -547,7 +548,8 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 					continue
 				}

-				marshal_entry(e, info, v, fname, i) or_return
+				err_conv(_encode_text(e, fname)) or_return
+				marshal_entry(e, info, v, i) or_return
 			}
 		}
 		return
@@ -0,0 +1,96 @@
+/*
+package csv reads and writes comma-separated values (CSV) files.
+This package supports the format described in [[ RFC 4180; https://tools.ietf.org/html/rfc4180.html ]]
+
+Example:
+	package main
+
+	import "core:fmt"
+	import "core:encoding/csv"
+	import "core:os"
+
+	// Requires keeping the entire CSV file in memory at once
+	iterate_csv_from_string :: proc(filename: string) {
+		r: csv.Reader
+		r.trim_leading_space  = true
+		r.reuse_record        = true // Without it you have to delete(record)
+		r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
+		defer csv.reader_destroy(&r)
+
+		csv_data, ok := os.read_entire_file(filename)
+		if ok {
+			csv.reader_init_with_string(&r, string(csv_data))
+		} else {
+			fmt.printfln("Unable to open file: %v", filename)
+			return
+		}
+		defer delete(csv_data)
+
+		for r, i, err in csv.iterator_next(&r) {
+			if err != nil { /* Do something with error */ }
+			for f, j in r {
+				fmt.printfln("Record %v, field %v: %q", i, j, f)
+			}
+		}
+	}
+
+	// Reads the CSV as it's processed (with a small buffer)
+	iterate_csv_from_stream :: proc(filename: string) {
+		fmt.printfln("Hellope from %v", filename)
+		r: csv.Reader
+		r.trim_leading_space  = true
+		r.reuse_record        = true // Without it you have to delete(record)
+		r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
+		defer csv.reader_destroy(&r)
+
+		handle, err := os.open(filename)
+		if err != nil {
+			fmt.eprintfln("Error opening file: %v", filename)
+			return
+		}
+		defer os.close(handle)
+		csv.reader_init(&r, os.stream_from_handle(handle))
+
+		for r, i in csv.iterator_next(&r) {
+			for f, j in r {
+				fmt.printfln("Record %v, field %v: %q", i, j, f)
+			}
+		}
+		fmt.printfln("Error: %v", csv.iterator_last_error(r))
+	}
+
+	// Read all records at once
+	read_csv_from_string :: proc(filename: string) {
+		r: csv.Reader
+		r.trim_leading_space  = true
+		r.reuse_record        = true // Without it you have to delete(record)
+		r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
+		defer csv.reader_destroy(&r)
+
+		csv_data, ok := os.read_entire_file(filename)
+		if ok {
+			csv.reader_init_with_string(&r, string(csv_data))
+		} else {
+			fmt.printfln("Unable to open file: %v", filename)
+			return
+		}
+		defer delete(csv_data)
+
+		records, err := csv.read_all(&r)
+		if err != nil { /* Do something with CSV parse error */ }
+
+		defer {
+			for rec in records {
+				delete(rec)
+			}
+			delete(records)
+		}
+
+		for r, i in records {
+			for f, j in r {
+				fmt.printfln("Record %v, field %v: %q", i, j, f)
+			}
+		}
+	}
+*/
+package encoding_csv
@@ -1,88 +0,0 @@
-//+build ignore
-package encoding_csv
-
-import "core:fmt"
-import "core:encoding/csv"
-import "core:os"
-
-// Requires keeping the entire CSV file in memory at once
-iterate_csv_from_string :: proc(filename: string) {
-	r: csv.Reader
-	r.trim_leading_space  = true
-	r.reuse_record        = true // Without it you have to delete(record)
-	r.reuse_record_buffer = true // Without it you have to each of the fields within it
-	defer csv.reader_destroy(&r)
-
-	if csv_data, ok := os.read_entire_file(filename); ok {
-		csv.reader_init_with_string(&r, string(csv_data))
-		defer delete(csv_data)
-	} else {
-		fmt.printfln("Unable to open file: %v", filename)
-		return
-	}
-
-	for r, i, err in csv.iterator_next(&r) {
-		if err != nil { /* Do something with error */ }
-		for f, j in r {
-			fmt.printfln("Record %v, field %v: %q", i, j, f)
-		}
-	}
-}
-
-// Reads the CSV as it's processed (with a small buffer)
-iterate_csv_from_stream :: proc(filename: string) {
-	fmt.printfln("Hellope from %v", filename)
-	r: csv.Reader
-	r.trim_leading_space  = true
-	r.reuse_record        = true // Without it you have to delete(record)
-	r.reuse_record_buffer = true // Without it you have to each of the fields within it
-	defer csv.reader_destroy(&r)
-
-	handle, err := os.open(filename)
-	if err != nil {
-		fmt.eprintfln("Error opening file: %v", filename)
-		return
-	}
-	defer os.close(handle)
-	csv.reader_init(&r, os.stream_from_handle(handle))
-
-	for r, i in csv.iterator_next(&r) {
-		for f, j in r {
-			fmt.printfln("Record %v, field %v: %q", i, j, f)
-		}
-	}
-	fmt.printfln("Error: %v", csv.iterator_last_error(r))
-}
-
-// Read all records at once
-read_csv_from_string :: proc(filename: string) {
-	r: csv.Reader
-	r.trim_leading_space  = true
-	r.reuse_record        = true // Without it you have to delete(record)
-	r.reuse_record_buffer = true // Without it you have to each of the fields within it
-	defer csv.reader_destroy(&r)
-
-	if csv_data, ok := os.read_entire_file(filename); ok {
-		csv.reader_init_with_string(&r, string(csv_data))
-		defer delete(csv_data)
-	} else {
-		fmt.printfln("Unable to open file: %v", filename)
-		return
-	}
-
-	records, err := csv.read_all(&r)
-	if err != nil { /* Do something with CSV parse error */ }
-
-	defer {
-		for rec in records {
-			delete(rec)
-		}
-		delete(records)
-	}
-
-	for r, i in records {
-		for f, j in r {
-			fmt.printfln("Record %v, field %v: %q", i, j, f)
-		}
-	}
-}
@@ -1,5 +1,5 @@
 // package csv reads and writes comma-separated values (CSV) files.
-// This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
+// This package supports the format described in [[ RFC 4180; https://tools.ietf.org/html/rfc4180.html ]]
 package encoding_csv

 import "core:bufio"
@@ -484,4 +484,4 @@ _read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.all
 		r.fields_per_record = len(dst)
 	}
 	return dst[:], err
-}
+}
@@ -2,22 +2,23 @@
    Package endian implements a simple translation between bytes and numbers with
    specific endian encodings.

-    buf: [100]u8
-    put_u16(buf[:], .Little, 16) or_return
+Example:
+	buf: [100]u8
+	put_u16(buf[:], .Little, 16) or_return

-    You may ask yourself, why isn't `byte_order` platform Endianness by default, so we can write:
-    put_u16(buf[:], 16) or_return
+	// You may ask yourself, why isn't `byte_order` platform Endianness by default, so we can write:
+	put_u16(buf[:], 16) or_return

-    The answer is that very few file formats are written in native/platform endianness. Most of them specify the endianness of
-    each of their fields, or use a header field which specifies it for the entire file.
+	// The answer is that very few file formats are written in native/platform endianness. Most of them specify the endianness of
+	// each of their fields, or use a header field which specifies it for the entire file.

-    e.g. a file which specifies it at the top for all fields could do this:
-    file_order := .Little if buf[0] == 0 else .Big
-    field := get_u16(buf[1:], file_order) or_return
+	// e.g. a file which specifies it at the top for all fields could do this:
+	file_order := .Little if buf[0] == 0 else .Big
+	field := get_u16(buf[1:], file_order) or_return

-    If on the other hand a field is *always* Big-Endian, you're wise to explicitly state it for the benefit of the reader,
-    be that your future self or someone else.
+	// If on the other hand a field is *always* Big-Endian, you're wise to explicitly state it for the benefit of the reader,
+	// be that your future self or someone else.

-    field := get_u16(buf[:], .Big) or_return
+	field := get_u16(buf[:], .Big) or_return
 */
 package encoding_endian
@@ -1,24 +1,26 @@
-package encoding_unicode_entity
 /*
-	A unicode entity encoder/decoder
-
 	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.

-	This code has several procedures to map unicode runes to/from different textual encodings.
-	- SGML/XML/HTML entity
-	-- &#<decimal>;
-	-- &#x<hexadecimal>;
-	-- &<entity name>;   (If the lookup tables are compiled in).
-	Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml	
-
-	- URL encode / decode %hex entity
-	Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
-
 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
 */

+/*
+	A unicode entity encoder/decoder.
+
+	This code has several procedures to map unicode runes to/from different textual encodings.
+	- SGML/XML/HTML entity
+	- &#<decimal>;
+	- &#x<hexadecimal>;
+	- &<entity name>;   (If the lookup tables are compiled in).
+	Reference: [[ https://www.w3.org/2003/entities/2007xml/unicode.xml ]]
+
+	- URL encode / decode %hex entity
+	Reference: [[ https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1 ]]
+*/
+package encoding_unicode_entity
+
 import "core:unicode/utf8"
 import "core:unicode"
 import "core:strings"
@@ -353,4 +355,4 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X

 	}
 	return false, .None
-}
+}
@@ -42,7 +42,7 @@ XML_NAME_TO_RUNE_MAX_LENGTH :: 31
 	Input:
 		entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.

-	Output:
+	Returns:
 		"decoded" - The decoded rune if found by name, or -1 otherwise.
 		"ok"      - true if found, false if not.

@@ -1,5 +1,6 @@
 package encoding_hex

+import "core:io"
 import "core:strings"

 encode :: proc(src: []byte, allocator := context.allocator, loc := #caller_location) -> []byte #no_bounds_check {
@@ -14,6 +15,12 @@ encode :: proc(src: []byte, allocator := context.allocator, loc := #caller_locat
 	return dst
 }

+encode_into_writer :: proc(dst: io.Writer, src: []byte) -> io.Error {
+	for v in src {
+		io.write(dst, {HEXTABLE[v>>4], HEXTABLE[v&0x0f]}) or_return // Two digits per byte, high nibble first; returns on the first write error.
+	}
+	return nil
+}

 decode :: proc(src: []byte, allocator := context.allocator, loc := #caller_location) -> (dst: []byte, ok: bool) #no_bounds_check {
 	if len(src) % 2 == 1 {
@@ -1,83 +1,89 @@
-// Implementation of the HxA 3D asset format
-// HxA is a interchangeable graphics asset format.
-// Designed by Eskil Steenberg. @quelsolaar / eskil 'at' obsession 'dot' se / www.quelsolaar.com
-//
-// Author of this Odin package: Ginger Bill
-//
-// Following comment is copied from the original C-implementation
-// ---------
-// -Does the world need another Graphics file format?
-// 	Unfortunately, Yes. All existing formats are either too large and complicated to be implemented from
-// 	scratch, or don't have some basic features needed in modern computer graphics.
-// -Who is this format for?
-// 	For people who want a capable open Graphics format that can be implemented from scratch in
-// 	a few hours. It is ideal for graphics researchers, game developers or other people who
-// 	wants to build custom graphics pipelines. Given how easy it is to parse and write, it
-// 	should be easy to write utilities that process assets to preform tasks like: generating
-// 	normals, light-maps, tangent spaces, Error detection, GPU optimization, LOD generation,
-// 	and UV mapping.
-// -Why store images in the format when there are so many good image formats already?
-// 	Yes there are, but only for 2D RGB/RGBA images. A lot of computer graphics rendering rely
-// 	on 1D, 3D, cube, multilayer, multi channel, floating point bitmap buffers. There almost no
-// 	formats for this kind of data. Also 3D files that reference separate image files rely on
-// 	file paths, and this often creates issues when the assets are moved. By including the
-// 	texture data in the files directly the assets become self contained.
-// -Why doesn't the format support <insert whatever>?
-// 	Because the entire point is to make a format that can be implemented. Features like NURBSs,
-// 	Construction history, or BSP trees would make the format too large to serve its purpose.
-// 	The facilities of the formats to store meta data should make the format flexible enough
-// 	for most uses. Adding HxA support should be something anyone can do in a days work.
-//
-// Structure:
-// ----------
-// HxA is designed to be extremely simple to parse, and is therefore based around conventions. It has
-// a few basic structures, and depending on how they are used they mean different things. This means
-// that you can implement a tool that loads the entire file, modifies the parts it cares about and
-// leaves the rest intact. It is also possible to write a tool that makes all data in the file
-// editable without the need to understand its use. It is also possible for anyone to use the format
-// to store data axillary data. Anyone who wants to store data not covered by a convention can submit
-// a convention to extend the format. There should never be a convention for storing the same data in
-// two differed ways.
-// The data is story in a number of nodes that are stored in an array. Each node stores an array of
-// meta data. Meta data can describe anything you want, and a lot of conventions will use meta data
-// to store additional information, for things like transforms, lights, shaders and animation.
-// Data for Vertices, Corners, Faces, and Pixels are stored in named layer stacks. Each stack consists
-// of a number of named layers. All layers in the stack have the same number of elements. Each layer
-// describes one property of the primitive. Each layer can have multiple channels and each layer can
-// store data of a different type.
-//
-// HaX stores 3 kinds of nodes
-// 	- Pixel data.
-// 	- Polygon geometry data.
-// 	- Meta data only.
-//
-// Pixel Nodes stores pixels in a layer stack. A layer may store things like Albedo, Roughness,
-// Reflectance, Light maps, Masks, Normal maps, and Displacement. Layers use the channels of the
-// layers to store things like color. The length of the layer stack is determined by the type and
-// dimensions stored in the
-//
-// Geometry data is stored in 3 separate layer stacks for: vertex data, corner data and face data. The
-// vertex data stores things like verities, blend shapes, weight maps, and vertex colors. The first
-// layer in a vertex stack has to be a 3 channel layer named "position" describing the base position
-// of the vertices. The corner stack describes data per corner or edge of the polygons. It can be used
-// for things like UV, normals, and adjacency. The first layer in a corner stack has to be a 1 channel
-// integer layer named "index" describing the vertices used to form polygons. The last value in each
-// polygon has a negative - 1 index to indicate the end of the polygon.
-//
-// Example:
-// 	A quad and a tri with the vertex index:
-// 		[0, 1, 2, 3] [1, 4, 2]
-// 	is stored:
-// 		[0, 1, 2, -4, 1, 4, -3]
-// The face stack stores values per face. the length of the face stack has to match the number of
-// negative values in the index layer in the corner stack. The face stack can be used to store things
-// like material index.
-//
-// Storage
-// -------
-// All data is stored in little endian byte order with no padding. The layout mirrors the structs
-// defined below with a few exceptions. All names are stored as a 8-bit unsigned integer indicating
-// the length of the name followed by that many characters. Termination is not stored in the file.
-// Text strings stored in meta data are stored the same way as names, but instead of a 8-bit unsigned
-// integer a 32-bit unsigned integer is used.
-package encoding_hxa
+/*
+Implementation of the HxA 3D asset format
+HxA is an interchangeable graphics asset format.
+Designed by Eskil Steenberg. @quelsolaar / eskil 'at' obsession 'dot' se / www.quelsolaar.com
+
+Author of this Odin package: Ginger Bill
+
+Following comment is copied from the original C-implementation  
+---------  
+- Does the world need another Graphics file format?  
+Unfortunately, Yes. All existing formats are either too large and complicated to be implemented from
+scratch, or don't have some basic features needed in modern computer graphics.
+
+- Who is this format for?  
+For people who want a capable open Graphics format that can be implemented from scratch in
+a few hours. It is ideal for graphics researchers, game developers or other people who
+want to build custom graphics pipelines. Given how easy it is to parse and write, it
+should be easy to write utilities that process assets to perform tasks like: generating
+normals, light-maps, tangent spaces, Error detection, GPU optimization, LOD generation,
+and UV mapping.
+
+- Why store images in the format when there are so many good image formats already?  
+Yes there are, but only for 2D RGB/RGBA images. A lot of computer graphics rendering relies
+on 1D, 3D, cube, multilayer, multi channel, floating point bitmap buffers. There are almost no
+formats for this kind of data. Also 3D files that reference separate image files rely on
+file paths, and this often creates issues when the assets are moved. By including the
+texture data in the files directly the assets become self contained.
+
+- Why doesn't the format support <insert whatever>?  
+Because the entire point is to make a format that can be implemented. Features like NURBSs,
+Construction history, or BSP trees would make the format too large to serve its purpose.
+The facilities of the formats to store meta data should make the format flexible enough
+for most uses. Adding HxA support should be something anyone can do in a day's work.
+
+Structure:  
+----------  
+HxA is designed to be extremely simple to parse, and is therefore based around conventions. It has
+a few basic structures, and depending on how they are used they mean different things. This means
+that you can implement a tool that loads the entire file, modifies the parts it cares about and
+leaves the rest intact. It is also possible to write a tool that makes all data in the file
+editable without the need to understand its use. It is also possible for anyone to use the format
+to store auxiliary data. Anyone who wants to store data not covered by a convention can submit
+a convention to extend the format. There should never be a convention for storing the same data in
+two different ways.
+
+The data is stored in a number of nodes that are stored in an array. Each node stores an array of
+meta data. Meta data can describe anything you want, and a lot of conventions will use meta data
+to store additional information, for things like transforms, lights, shaders and animation.
+Data for Vertices, Corners, Faces, and Pixels are stored in named layer stacks. Each stack consists
+of a number of named layers. All layers in the stack have the same number of elements. Each layer
+describes one property of the primitive. Each layer can have multiple channels and each layer can
+store data of a different type.
+
+HxA stores 3 kinds of nodes:
+- Pixel data.
+- Polygon geometry data.
+- Meta data only.
+
+Pixel nodes store pixels in a layer stack. A layer may store things like Albedo, Roughness,
+Reflectance, Light maps, Masks, Normal maps, and Displacement. Layers use the channels of the
+layers to store things like color. The length of the layer stack is determined by the type and dimensions stored in the node.
+
+Geometry data is stored in 3 separate layer stacks for: vertex data, corner data and face data. The
+vertex data stores things like vertices, blend shapes, weight maps, and vertex colors. The first
+layer in a vertex stack has to be a 3 channel layer named "position" describing the base position
+of the vertices. The corner stack describes data per corner or edge of the polygons. It can be used
+for things like UV, normals, and adjacency. The first layer in a corner stack has to be a 1 channel
+integer layer named "index" describing the vertices used to form polygons. The last value in each
+polygon is stored as a negative value, (-index - 1), to indicate the end of the polygon.
+
+For Example:
+	A quad and a tri with the vertex index:
+		[0, 1, 2, 3] [1, 4, 2]
+	is stored:
+		[0, 1, 2, -4, 1, 4, -3]
+
+The face stack stores values per face. The length of the face stack has to match the number of
+negative values in the index layer in the corner stack. The face stack can be used to store things
+like material index.
+
+Storage:  
+-------  
+All data is stored in little endian byte order with no padding. The layout mirrors the structs
+defined below with a few exceptions. All names are stored as an 8-bit unsigned integer indicating
+the length of the name followed by that many characters. Termination is not stored in the file.
+Text strings stored in meta data are stored the same way as names, but instead of an 8-bit unsigned
+integer a 32-bit unsigned integer is used.
+*/
+package encoding_hxa
@@ -116,7 +116,30 @@ assign_int :: proc(val: any, i: $T) -> bool {
 	case int:     dst = int    (i)
 	case uint:    dst = uint   (i)
 	case uintptr: dst = uintptr(i)
-	case: return false
+	case:
+		ti := type_info_of(v.id)
+		if _, ok := ti.variant.(runtime.Type_Info_Bit_Set); ok {
+			do_byte_swap := !reflect.bit_set_is_big_endian(v)
+			switch ti.size * 8 {
+			case 0: // no-op.
+			case 8:
+				x := (^u8)(v.data)
+				x^ = u8(i)
+			case 16:
+				x := (^u16)(v.data)
+				x^ = do_byte_swap ? intrinsics.byte_swap(u16(i)) : u16(i)
+			case 32:
+				x := (^u32)(v.data)
+				x^ = do_byte_swap ? intrinsics.byte_swap(u32(i)) : u32(i)
+			case 64:
+				x := (^u64)(v.data)
+				x^ = do_byte_swap ? intrinsics.byte_swap(u64(i)) : u64(i)
+			case:
+				panic("unknown bit_size size")
+			}
+			return true
+		}
+		return false
 	}
 	return true
 }
@@ -21,8 +21,9 @@ cryptographically-secure, per RFC 9562's suggestion.
 - Version 6 without either a clock or node argument.
 - Version 7 in all cases.

-Here's an example of how to set up one:
-	
+Example:	
+	package main
+
 	import "core:crypto"
 	import "core:encoding/uuid"

@@ -40,7 +41,7 @@ Here's an example of how to set up one:


 For more information on the specifications, see here:
- https://www.rfc-editor.org/rfc/rfc4122.html
- https://www.rfc-editor.org/rfc/rfc9562.html
+- [[ https://www.rfc-editor.org/rfc/rfc4122.html ]]
+- [[ https://www.rfc-editor.org/rfc/rfc9562.html ]]
 */
 package uuid
@@ -11,7 +11,7 @@ Write a UUID in the 8-4-4-4-12 format.
 This procedure performs error checking with every byte written.

 If you can guarantee beforehand that your stream has enough space to hold the
-UUID (32 bytes), then it is better to use `unsafe_write` instead as that will
+UUID (36 bytes), then it is better to use `unsafe_write` instead as that will
 be faster.

 Inputs:
@@ -22,7 +22,7 @@ Returns:
 - error: An `io` error, if one occurred, otherwise `nil`.
 */
 write :: proc(w: io.Writer, id: Identifier) -> (error: io.Error) #no_bounds_check {
-	write_octet :: proc (w: io.Writer, octet: u8) -> io.Error #no_bounds_check {
+	write_octet :: proc(w: io.Writer, octet: u8) -> io.Error #no_bounds_check {
 		high_nibble := octet >> 4
 		low_nibble := octet & 0xF

@@ -31,15 +31,15 @@ write :: proc(w: io.Writer, id: Identifier) -> (error: io.Error) #no_bounds_chec
 		return nil
 	}

-	for index in  0 ..<  4 { write_octet(w, id[index]) or_return }
+	for index in 0 ..< 4 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  4 ..<  6 { write_octet(w, id[index]) or_return }
+	for index in 4 ..< 6 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  6 ..<  8 { write_octet(w, id[index]) or_return }
+	for index in 6 ..< 8 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  8 ..< 10 { write_octet(w, id[index]) or_return }
+	for index in 8 ..< 10 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in 10 ..< 16 { write_octet(w, id[index]) or_return }
+	for index in 10 ..< 16 {write_octet(w, id[index]) or_return}

 	return nil
 }
@@ -54,7 +54,7 @@ Inputs:
 - id: The identifier to convert.
 */
 unsafe_write :: proc(w: io.Writer, id: Identifier) #no_bounds_check {
-	write_octet :: proc (w: io.Writer, octet: u8) #no_bounds_check {
+	write_octet :: proc(w: io.Writer, octet: u8) #no_bounds_check {
 		high_nibble := octet >> 4
 		low_nibble := octet & 0xF

@@ -62,15 +62,15 @@ unsafe_write :: proc(w: io.Writer, id: Identifier) #no_bounds_check {
 		io.write_byte(w, strconv.digits[low_nibble])
 	}

-	for index in  0 ..<  4 { write_octet(w, id[index]) }
+	for index in 0 ..< 4 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  4 ..<  6 { write_octet(w, id[index]) }
+	for index in 4 ..< 6 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  6 ..<  8 { write_octet(w, id[index]) }
+	for index in 6 ..< 8 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  8 ..< 10 { write_octet(w, id[index]) }
+	for index in 8 ..< 10 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in 10 ..< 16 { write_octet(w, id[index]) }
+	for index in 10 ..< 16 {write_octet(w, id[index])}
 }

 /*
@@ -106,7 +106,7 @@ Convert a UUID to a string in the 8-4-4-4-12 format.

 Inputs:
 - id: The identifier to convert.
- buffer: A byte buffer to store the result. Must be at least 32 bytes large.
+- buffer: A byte buffer to store the result. Must be at least 36 bytes large.
 - loc: The caller location for debugging purposes (default: #caller_location)

 Returns:
@@ -119,7 +119,11 @@ to_string_buffer :: proc(
 ) -> (
 	str: string,
 ) {
-	assert(len(buffer) >= EXPECTED_LENGTH, "The buffer provided is not at least 32 bytes large.", loc)
+	assert(
+		len(buffer) >= EXPECTED_LENGTH,
+		"The buffer provided is not at least 36 bytes large.",
+		loc,
+	)
 	builder := strings.builder_from_bytes(buffer)
 	unsafe_write(strings.to_writer(&builder), id)
 	return strings.to_string(builder)
@@ -129,3 +133,4 @@ to_string :: proc {
 	to_string_allocated,
 	to_string_buffer,
 }
+
@@ -1,10 +1,11 @@
 /*
-	Implementation of the LEB128 variable integer encoding as used by DWARF encoding and DEX files, among others.
+Implementation of the LEB128 variable integer encoding as used by DWARF encoding and DEX files, among others.

-	Author of this Odin package: Jeroen van Rijn
+Author of this Odin package: Jeroen van Rijn
+
+Example:
+	package main

-	Example:
-	```odin
 	import "core:encoding/varint"
 	import "core:fmt"

@@ -22,7 +23,5 @@
 		assert(decoded_val == value && decode_size == encode_size && decode_err == .None)
 		fmt.printf("Decoded as %v, using %v byte%v\n", decoded_val, decode_size, "" if decode_size == 1 else "s")
 	}
-	```
-
 */
-package encoding_varint
+package encoding_varint
@@ -6,8 +6,6 @@
 		Jeroen van Rijn: Initial implementation.
 */

-// package varint implements variable length integer encoding and decoding using
-// the LEB128 format as used by DWARF debug info, Android .dex and other file formats.
 package encoding_varint

 // In theory we should use the bigint package. In practice, varints bigger than this indicate a corrupted file.
@@ -160,4 +158,4 @@ encode_ileb128 :: proc(buf: []u8, val: i128) -> (size: int, err: Error) {
 		buf[size - 1] = u8(low)
 	}
 	return
-}
+}
--- a/Show More
+++ b/Show More