Merge pull request #4175 from laytan/riscv-compiler

Support RISC-V for the compiler itself
Merge pull request #4176 from Feoramund/fix-context-error-msg
2026-07-05 11:11:37 -07:00 · 2024-09-02 00:20:04 +01:00 · 2024-09-02 00:18:15 +01:00 · 2024-09-01 17:50:50 -04:00 · 2024-09-01 21:42:47 +02:00 · 2024-09-01 17:22:58 +02:00
639 changed files with 64198 additions and 9891 deletions
@@ -6,7 +6,7 @@ jobs:
    name: NetBSD Build, Check, and Test
    runs-on: ubuntu-latest
    env:
-      PKGSRC_BRANCH: 2024Q1
+      PKGSRC_BRANCH: 2024Q2
    steps:
    - uses: actions/checkout@v4
    - name: Build, Check, and Test
@@ -18,13 +18,11 @@ jobs:
        usesh: true
        copyback: false
        prepare: |
-          PKG_PATH="https://cdn.NetBSD.org/pub/pkgsrc/packages/NetBSD/$(uname -p)/$(uname -r | cut -d_ -f1)_${PKGSRC_BRANCH}/All" /usr/sbin/pkg_add pkgin
-          pkgin -y in gmake git bash python311
-          pkgin -y in libxml2 perl zstd
-          /usr/sbin/pkg_add https://github.com/andreas-jonsson/llvm17-netbsd-bin/releases/download/pkgsrc-current/llvm-17.0.6.tgz
-          /usr/sbin/pkg_add https://github.com/andreas-jonsson/llvm17-netbsd-bin/releases/download/pkgsrc-current/clang-17.0.6.tgz
+          PKG_PATH="https://cdn.NetBSD.org/pub/pkgsrc/packages/NetBSD/amd64/$(uname -r | cut -d_ -f1)_${PKGSRC_BRANCH}/All" /usr/sbin/pkg_add pkgin
+          pkgin -y in gmake git bash python311 llvm clang
          ln -s /usr/pkg/bin/python3.11 /usr/bin/python3
        run: |
+          set -e -x
          git config --global --add safe.directory $(pwd)
          gmake release
          ./odin version
@@ -34,10 +32,9 @@ jobs:
          gmake -C vendor/miniaudio/src
          ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_amd64
          ./odin check examples/all -vet -strict-style -disallow-do -target:netbsd_arm64
-          ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/core/speed.odin -file -all-packages -o:speed -define:ODIN_TEST_FANCY=false
-          ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
          (cd tests/issues; ./run.sh)
  build_freebsd:
    name: FreeBSD Build, Check, and Test
@@ -63,10 +60,9 @@ jobs:
          gmake -C vendor/cgltf/src
          gmake -C vendor/miniaudio/src
          ./odin check examples/all -vet -strict-style -disallow-do -target:freebsd_amd64
-          ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/core/speed.odin -file -all-packages -o:speed -define:ODIN_TEST_FANCY=false
-          ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
-          ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/core/speed.odin -file -all-packages -vet -strict-style -disallow-do -o:speed -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
+          ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
          (cd tests/issues; ./run.sh)
  ci:
    strategy:
@@ -91,13 +87,13 @@ jobs:
      - name: Download LLVM (MacOS Intel)
        if: matrix.os == 'macos-13'
        run: |
-          brew install llvm@17
+          brew install llvm@17 lua@5.4
          echo "/usr/local/opt/llvm@17/bin" >> $GITHUB_PATH

      - name: Download LLVM (MacOS ARM)
        if: matrix.os == 'macos-14'
        run: |
-          brew install llvm@17 wasmtime
+          brew install llvm@17 wasmtime lua@5.4
          echo "/opt/homebrew/opt/llvm@17/bin" >> $GITHUB_PATH

      - name: Build Odin
@@ -120,15 +116,13 @@ jobs:
      - name: Odin check examples/all
        run: ./odin check examples/all -strict-style
      - name: Normal Core library tests
-        run: ./odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Optimized Core library tests
-        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Vendor library tests
-        run: ./odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Internals tests
-        run: ./odin test tests/internal -all-packages -define:ODIN_TEST_FANCY=false
-      - name: Core library benchmarks
-        run: ./odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+        run: ./odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: GitHub Issue tests
        run: |
          cd tests/issues
@@ -182,37 +176,33 @@ jobs:
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin run examples/demo -debug
+          odin run examples/demo -debug -vet -strict-style -disallow-do
      - name: Odin check examples/all
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin check examples/all -strict-style
+          odin check examples/all -vet -strict-style -disallow-do
      - name: Core library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/core/normal.odin -file -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Optimized core library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/core/speed.odin -o:speed -file -all-packages -define:ODIN_TEST_FANCY=false
-      - name: Core library benchmarks
-        shell: cmd
-        run: |
-          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/benchmark -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Vendor library tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
+          copy vendor\lua\5.4\windows\*.dll .
+          odin test tests/vendor -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Odin internals tests
        shell: cmd
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
-          odin test tests/internal -all-packages -define:ODIN_TEST_FANCY=false
+          odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true
      - name: Odin documentation tests
        shell: cmd
        run: |
@@ -230,3 +220,53 @@ jobs:
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
          odin check examples/all -strict-style -target:windows_i386
+
+  build_linux_riscv64:
+    runs-on: ubuntu-latest
+    name: Linux riscv64 (emulated) Build, Check and Test
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download LLVM (Linux)
+        run: |
+          wget https://apt.llvm.org/llvm.sh
+          chmod +x llvm.sh
+          sudo ./llvm.sh 18
+          echo "/usr/lib/llvm-18/bin" >> $GITHUB_PATH
+
+      - name: Build Odin
+        run: ./build_odin.sh release
+
+      - name: Odin version
+        run: ./odin version
+
+      - name: Odin report
+        run: ./odin report
+
+      - name: Compile needed Vendor
+        run: |
+          make -C vendor/stb/src
+          make -C vendor/cgltf/src
+          make -C vendor/miniaudio/src
+
+      - name: Odin check
+        run: ./odin check examples/all -target:linux_riscv64 -vet -strict-style -disallow-do
+
+      - name: Install riscv64 toolchain and qemu
+        run: sudo apt-get install -y qemu-user qemu-user-static gcc-12-riscv64-linux-gnu libc6-riscv64-cross
+
+      - name: Odin run
+        run: ./odin run examples/demo -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Odin run -debug
+        run: ./odin run examples/demo -debug -vet -strict-style -disallow-do -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Normal Core library tests
+        run: ./odin test tests/core/normal.odin -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Optimized Core library tests
+        run: ./odin test tests/core/speed.odin -o:speed -file -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
+
+      - name: Internals tests
+        run: ./odin test tests/internal -all-packages -vet -strict-style -disallow-do -define:ODIN_TEST_FANCY=false -define:ODIN_TEST_FAIL_ON_BAD_MEMORY=true -target:linux_riscv64 -extra-linker-flags:"-fuse-ld=/usr/bin/riscv64-linux-gnu-gcc-12 -static -Wl,-static"
@@ -50,8 +50,8 @@ jobs:
        run: |
          wget https://apt.llvm.org/llvm.sh
          chmod +x llvm.sh
-          sudo ./llvm.sh 17
-          echo "/usr/lib/llvm-17/bin" >> $GITHUB_PATH
+          sudo ./llvm.sh 18
+          echo "/usr/lib/llvm-18/bin" >> $GITHUB_PATH
      - name: build odin
        run: make nightly
      - name: Odin run
@@ -61,7 +61,6 @@ jobs:
          mkdir dist
          cp odin dist
          cp LICENSE dist
-          cp libLLVM* dist
          cp -r shared dist
          cp -r base dist
          cp -r core dist
@@ -82,8 +81,8 @@ jobs:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
-          brew install llvm@17 dylibbundler
-          echo "/usr/local/opt/llvm@17/bin" >> $GITHUB_PATH
+          brew install llvm@18 dylibbundler
+          echo "/usr/local/opt/llvm@18/bin" >> $GITHUB_PATH
      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
        # not link with libunwind bundled with LLVM but link with libunwind on the system.
@@ -116,8 +115,8 @@ jobs:
      - uses: actions/checkout@v4
      - name: Download LLVM and setup PATH
        run: |
-          brew install llvm@17 dylibbundler
-          echo "/opt/homebrew/opt/llvm@17/bin" >> $GITHUB_PATH
+          brew install llvm@18 dylibbundler
+          echo "/opt/homebrew/opt/llvm@18/bin" >> $GITHUB_PATH
      - name: build odin
        # These -L makes the linker prioritize system libraries over LLVM libraries, this is mainly to
        # not link with libunwind bundled with LLVM but link with libunwind on the system.
@@ -17,45 +17,12 @@
 [Rr]eleases/
 x64/
 x86/
+!/core/simd/x86
 bld/
 [Bb]in/
 [Oo]bj/
 [Ll]og/
 ![Cc]ore/[Ll]og/
-tests/documentation/verify/
-tests/documentation/all.odin-doc
-tests/internal/test_map
-tests/internal/test_pow
-tests/internal/test_rtti
-tests/core/test_base64
-tests/core/test_cbor
-tests/core/test_core_compress
-tests/core/test_core_container
-tests/core/test_core_filepath
-tests/core/test_core_fmt
-tests/core/test_core_i18n
-tests/core/test_core_image
-tests/core/test_core_libc
-tests/core/test_core_match
-tests/core/test_core_math
-tests/core/test_core_net
-tests/core/test_core_os_exit
-tests/core/test_core_reflect
-tests/core/test_core_strings
-tests/core/test_core_time
-tests/core/test_crypto
-tests/core/test_hash
-tests/core/test_hex
-tests/core/test_hxa
-tests/core/test_json
-tests/core/test_linalg_glsl_math
-tests/core/test_noise
-tests/core/test_varint
-tests/core/test_xml
-tests/core/test_core_slice
-tests/core/test_core_thread
-tests/core/test_core_runtime
-tests/vendor/vendor_botan
 # Visual Studio 2015 cache/options directory
 .vs/
 # Visual Studio Code options directory
@@ -1,4 +1,4 @@
-all: debug
+all: default

 demo:
 	./odin run examples/demo/demo.odin -file
@@ -6,12 +6,18 @@ demo:
 report:
 	./odin report

+default:
+	PROGRAM=make ./build_odin.sh # debug
+
 debug:
 	./build_odin.sh debug

 release:
 	./build_odin.sh release

+release-native:
+	./build_odin.sh release-native
+
 release_native:
 	./build_odin.sh release-native

@@ -76,9 +76,9 @@ Answers to common questions about Odin.

 Documentation for all the official packages part of the [core](https://pkg.odin-lang.org/core/) and [vendor](https://pkg.odin-lang.org/vendor/) library collections.

-#### [The Odin Wiki](https://github.com/odin-lang/Odin/wiki)
+#### [Odin Documentation](https://odin-lang.org/docs/)

-A wiki maintained by the Odin community.
+Documentation for the Odin language itself.

 #### [Odin Discord](https://discord.gg/sVBPHEv)

@@ -38,9 +38,12 @@ count_leading_zeros  :: proc(x: $T) -> T where type_is_integer(T) || type_is_sim
 reverse_bits         :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
 byte_swap            :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) ---

-overflow_add :: proc(lhs, rhs: $T) -> (T, bool) ---
-overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) ---
-overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) ---
+overflow_add :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+
+saturating_add :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
+saturating_sub :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---

 sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---

@@ -216,14 +219,21 @@ type_map_cell_info :: proc($T: typeid)           -> ^runtime.Map_Cell_Info ---
 type_convert_variants_to_pointers :: proc($T: typeid) -> typeid where type_is_union(T) ---
 type_merge :: proc($U, $V: typeid) -> typeid where type_is_union(U), type_is_union(V) ---

+type_has_shared_fields :: proc($U, $V: typeid) -> bool where type_is_struct(U), type_is_struct(V) ---
+
 constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---

+constant_log2 :: proc($v: $T) -> T where type_is_integer(T) ---
+
 // SIMD related
 simd_add  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_mul  :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_div  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_float(T) ---

+simd_saturating_add  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_integer(T) ---
+simd_saturating_sub  :: proc(a, b: #simd[N]T) -> #simd[N]T where type_is_integer(T) ---
+
 // Keeps Odin's Behaviour
 // (x << y) if y <= mask else 0
 simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
@@ -234,9 +244,6 @@ simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
 simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
 simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---

-simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
-simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
-
 simd_bit_and     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_bit_or      :: proc(a, b: #simd[N]T) -> #simd[N]T ---
 simd_bit_xor     :: proc(a, b: #simd[N]T) -> #simd[N]T ---
@@ -265,13 +272,28 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---

-simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T ---
-simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T ---
-simd_reduce_min         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_max         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_and         :: proc(a: #simd[N]T) -> T ---
-simd_reduce_or          :: proc(a: #simd[N]T) -> T ---
-simd_reduce_xor         :: proc(a: #simd[N]T) -> T ---
+simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_or          :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+simd_reduce_xor         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T) ---
+
+simd_reduce_any         :: proc(a: #simd[N]T) -> T where type_is_boolean(T) ---
+simd_reduce_all         :: proc(a: #simd[N]T) -> T where type_is_boolean(T) ---
+
+
+simd_gather       :: proc(ptr: #simd[N]rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_scatter      :: proc(ptr: #simd[N]rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+simd_masked_load  :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_masked_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+simd_masked_expand_load    :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U) -> #simd[N]T where type_is_integer(U) || type_is_boolean(U) ---
+simd_masked_compress_store :: proc(ptr: rawptr, val: #simd[N]T, mask: #simd[N]U)              where type_is_integer(U) || type_is_boolean(U) ---
+
+

 simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
 simd_select  :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---
@@ -285,11 +307,11 @@ simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---

 simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---

-// equivalent a swizzle with descending indices, e.g. reserve(a, 3, 2, 1, 0)
-simd_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---
+// equivalent to a swizzle with descending indices, e.g. swizzle(a, 3, 2, 1, 0)
+simd_lanes_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---

-simd_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
-simd_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_left  :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
+simd_lanes_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---

 // Checks if the current target supports the given target features.
 //
@@ -66,7 +66,7 @@ Type_Info_Named :: struct {
 	name: string,
 	base: ^Type_Info,
 	pkg:  string,
-	loc:  Source_Code_Location,
+	loc:  ^Source_Code_Location,
 }
 Type_Info_Integer    :: struct {signed: bool, endianness: Platform_Endianness}
 Type_Info_Rune       :: struct {}
@@ -112,23 +112,32 @@ Type_Info_Parameters :: struct { // Only used for procedures parameters and resu
 }
 Type_Info_Tuple :: Type_Info_Parameters // Will be removed eventually

-Type_Info_Struct :: struct {
-	types:        []^Type_Info,
-	names:        []string,
-	offsets:      []uintptr,
-	usings:       []bool,
-	tags:         []string,
-	is_packed:    bool,
-	is_raw_union: bool,
-	is_no_copy:   bool,
-	custom_align: bool,
+Type_Info_Struct_Flags :: distinct bit_set[Type_Info_Struct_Flag; u8]
+Type_Info_Struct_Flag :: enum u8 {
+	packed    = 0,
+	raw_union = 1,
+	no_copy   = 2,
+	align     = 3,
+}

-	equal: Equal_Proc, // set only when the struct has .Comparable set but does not have .Simple_Compare set
+Type_Info_Struct :: struct {
+	// Slice these with `field_count`
+	types:   [^]^Type_Info `fmt:"v,field_count"`,
+	names:   [^]string     `fmt:"v,field_count"`,
+	offsets: [^]uintptr    `fmt:"v,field_count"`,
+	usings:  [^]bool       `fmt:"v,field_count"`,
+	tags:    [^]string     `fmt:"v,field_count"`,
+
+	field_count: i32,
+
+	flags: Type_Info_Struct_Flags,

 	// These are only set iff this structure is an SOA structure
 	soa_kind:      Type_Info_Struct_Soa_Kind,
+	soa_len:       i32,
 	soa_base_type: ^Type_Info,
-	soa_len:       int,
+
+	equal: Equal_Proc, // set only when the struct has .Comparable set but does not have .Simple_Compare set
 }
 Type_Info_Union :: struct {
 	variants:     []^Type_Info,
@@ -142,9 +151,9 @@ Type_Info_Union :: struct {
 	shared_nil:   bool,
 }
 Type_Info_Enum :: struct {
-	base:      ^Type_Info,
-	names:     []string,
-	values:    []Type_Info_Enum_Value,
+	base:   ^Type_Info,
+	names:  []string,
+	values: []Type_Info_Enum_Value,
 }
 Type_Info_Map :: struct {
 	key:      ^Type_Info,
@@ -187,11 +196,12 @@ Type_Info_Soa_Pointer :: struct {
 }
 Type_Info_Bit_Field :: struct {
 	backing_type: ^Type_Info,
-	names:        []string,
-	types:        []^Type_Info,
-	bit_sizes:    []uintptr,
-	bit_offsets:  []uintptr,
-	tags:         []string,
+	names:        [^]string     `fmt:"v,field_count"`,
+	types:        [^]^Type_Info `fmt:"v,field_count"`,
+	bit_sizes:    [^]uintptr    `fmt:"v,field_count"`,
+	bit_offsets:  [^]uintptr    `fmt:"v,field_count"`,
+	tags:         [^]string     `fmt:"v,field_count"`,
+	field_count:  int,
 }

 Type_Info_Flag :: enum u8 {
@@ -299,6 +309,8 @@ when ODIN_OS == .Windows {
 		Thread_Detach  = 3,
 	}
 	dll_forward_reason: DLL_Forward_Reason
+
+	dll_instance: rawptr
 }

 // IMPORTANT NOTE(bill): Must be in this order (as the compiler relies upon it)
@@ -513,11 +525,12 @@ Raw_Quaternion256_Vector_Scalar :: struct {vector: [3]f64, scalar: f64}
 		Linux,
 		Essence,
 		FreeBSD,
-		Haiku,
 		OpenBSD,
 		NetBSD,
+		Haiku,
 		WASI,
 		JS,
+		Orca,
 		Freestanding,
 	}
 */
@@ -533,10 +546,23 @@ Odin_OS_Type :: type_of(ODIN_OS)
 		arm64,
 		wasm32,
 		wasm64p32,
+		riscv64,
 	}
 */
 Odin_Arch_Type :: type_of(ODIN_ARCH)

+Odin_Arch_Types :: bit_set[Odin_Arch_Type]
+
+ALL_ODIN_ARCH_TYPES :: Odin_Arch_Types{
+	.amd64,
+	.i386,
+	.arm32,
+	.arm64,
+	.wasm32,
+	.wasm64p32,
+	.riscv64,
+}
+
 /*
 	// Defined internally by the compiler
 	Odin_Build_Mode_Type :: enum int {
@@ -560,6 +586,22 @@ Odin_Build_Mode_Type :: type_of(ODIN_BUILD_MODE)
 */
 Odin_Endian_Type :: type_of(ODIN_ENDIAN)

+Odin_OS_Types :: bit_set[Odin_OS_Type]
+
+ALL_ODIN_OS_TYPES :: Odin_OS_Types{
+	.Windows,
+	.Darwin,
+	.Linux,
+	.Essence,
+	.FreeBSD,
+	.OpenBSD,
+	.NetBSD,
+	.Haiku,
+	.WASI,
+	.JS,
+	.Orca,
+	.Freestanding,
+}

 /*
 	// Defined internally by the compiler
@@ -577,7 +619,7 @@ Odin_Platform_Subtarget_Type :: type_of(ODIN_PLATFORM_SUBTARGET)
 		Memory  = 1,
 		Thread  = 2,
 	}
-	Odin_Sanitizer_Flags :: distinct bitset[Odin_Sanitizer_Flag; u32]
+	Odin_Sanitizer_Flags :: distinct bit_set[Odin_Sanitizer_Flag; u32]

 	ODIN_SANITIZER_FLAGS // is a constant
 */
@@ -737,6 +779,10 @@ __init_context :: proc "contextless" (c: ^Context) {
 }

 default_assertion_failure_proc :: proc(prefix, message: string, loc: Source_Code_Location) -> ! {
+	default_assertion_contextless_failure_proc(prefix, message, loc)
+}
+
+default_assertion_contextless_failure_proc :: proc "contextless" (prefix, message: string, loc: Source_Code_Location) -> ! {
 	when ODIN_OS == .Freestanding {
 		// Do nothing
 	} else {
@@ -333,17 +333,24 @@ make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, alloca
 // Note: Prefer using the procedure group `make`.
@(builtin, require_results)
 make_dynamic_array_len_cap :: proc($T: typeid/[dynamic]$E, #any_int len: int, #any_int cap: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
-	make_dynamic_array_error_loc(loc, len, cap)
-	array.allocator = allocator // initialize allocator before just in case it fails to allocate any memory
-	data := mem_alloc_bytes(size_of(E)*cap, align_of(E), allocator, loc) or_return
-	s := Raw_Dynamic_Array{raw_data(data), len, cap, allocator}
-	if data == nil && size_of(E) != 0 {
-		s.len, s.cap = 0, 0
-	}
-	array = transmute(T)s
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), len, cap, allocator, loc)
 	return
 }
-// `make_map` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
+
+@(require_results)
+_make_dynamic_array_len_cap :: proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, #any_int len: int, #any_int cap: int, allocator := context.allocator, loc := #caller_location) -> (err: Allocator_Error) { // non-generic worker: fills in the raw array header from an element size/alignment instead of an element type
+	make_dynamic_array_error_loc(loc, len, cap)
+	array.allocator = allocator // initialize allocator before just in case it fails to allocate any memory
+	data := mem_alloc_bytes(size_of_elem*cap, align_of_elem, allocator, loc) or_return // on an allocation error this returns early with `err` set; only `array.allocator` has been written
+	use_zero := data == nil && size_of_elem != 0 // no backing memory came back for a non-zero-sized element: store len/cap as 0
+	array.data = raw_data(data)
+	array.len = 0 if use_zero else len
+	array.cap = 0 if use_zero else cap
+	array.allocator = allocator // NOTE(review): repeats the assignment made before the allocation; looks redundant - confirm
+	return
+}
+
+// `make_map` allocates and initializes a map. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
@@ -355,7 +362,7 @@ make_map :: proc($T: typeid/map[$K]$E, #any_int capacity: int = 1<<MAP_MIN_LOG2_
 	err = reserve_map(&m, capacity, loc)
 	return
 }
-// `make_multi_pointer` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
+// `make_multi_pointer` allocates and initializes a multi-pointer. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // This is "similar" to doing `raw_data(make([]E, len, allocator))`.
@@ -440,107 +447,103 @@ delete_key :: proc(m: ^$T/map[$K]$V, key: K) -> (deleted_key: K, deleted_value:
 	return
 }

-_append_elem :: #force_inline proc(array: ^$T/[dynamic]$E, arg: E, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elem :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, arg_ptr: rawptr, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
-		return 0, nil
+		return
 	}
-	when size_of(E) == 0 {
-		array := (^Raw_Dynamic_Array)(array)
-		array.len += 1
-		return 1, nil
-	} else {
-		if cap(array) < len(array)+1 {
-			// Same behavior as _append_elems but there's only one arg, so we always just add DEFAULT_DYNAMIC_ARRAY_CAPACITY.
-			cap := 2 * cap(array) + DEFAULT_DYNAMIC_ARRAY_CAPACITY

-			// do not 'or_return' here as it could be a partial success
-			if should_zero {
-				err = reserve(array, cap, loc)
-			} else {
-				err = non_zero_reserve(array, cap, loc) 
-			}
-		}
-		if cap(array)-len(array) > 0 {
-			a := (^Raw_Dynamic_Array)(array)
-			when size_of(E) != 0 {
-				data := ([^]E)(a.data)
-				assert(data != nil, loc=loc)
-				data[a.len] = arg
-			}
-			a.len += 1
-			return 1, err
-		}
-		return 0, err
+	if array.cap < array.len+1 {
+		// Same behavior as _append_elems but there's only one arg, so we always just add DEFAULT_DYNAMIC_ARRAY_CAPACITY.
+		cap := 2 * array.cap + DEFAULT_DYNAMIC_ARRAY_CAPACITY
+
+		// do not 'or_return' here as it could be a partial success
+		err = _reserve_dynamic_array(array, size_of_elem, align_of_elem, cap, should_zero, loc)
 	}
+	if array.cap-array.len > 0 {
+		data := ([^]byte)(array.data)
+		assert(data != nil, loc=loc)
+		data = data[array.len*size_of_elem:]
+		intrinsics.mem_copy_non_overlapping(data, arg_ptr, size_of_elem)
+		array.len += 1
+		n = 1
+	}
+	return
 }

@builtin
 append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elem(array, arg, true, loc=loc)
+	when size_of(E) == 0 { // zero-sized elements: nothing is copied, only the length is bumped
+		(^Raw_Dynamic_Array)(array).len += 1 // NOTE(review): writes through `array` without the `array == nil` guard that `_append_elem` performs - confirm nil is not expected here
+		return 1, nil
+	} else {
+		arg := arg // local copy so its address can be passed to the non-generic helper
+		return _append_elem((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), &arg, true, loc=loc)
+	}
 }

@builtin
 non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elem(array, arg, false, loc=loc)
+	when size_of(E) == 0 { // zero-sized elements: nothing is copied, only the length is bumped
+		(^Raw_Dynamic_Array)(array).len += 1 // NOTE(review): writes through `array` without the `array == nil` guard that `_append_elem` performs - confirm nil is not expected here
+		return 1, nil
+	} else {
+		arg := arg // local copy so its address can be passed to the non-generic helper
+		return _append_elem((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), &arg, false, loc=loc) // `false` is the helper's `should_zero` argument
+	}
 }

-_append_elems :: #force_inline proc(array: ^$T/[dynamic]$E, should_zero: bool, loc := #caller_location, args: ..E) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elems :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, should_zero: bool, loc := #caller_location, args: rawptr, arg_len: int) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return 0, nil
 	}

-	arg_len := len(args)
 	if arg_len <= 0 {
 		return 0, nil
 	}

-	when size_of(E) == 0 {
-		array := (^Raw_Dynamic_Array)(array)
-		array.len += arg_len
-		return arg_len, nil
-	} else {
-		if cap(array) < len(array)+arg_len {
-			cap := 2 * cap(array) + max(DEFAULT_DYNAMIC_ARRAY_CAPACITY, arg_len)
+	if array.cap < array.len+arg_len {
+		cap := 2 * array.cap + max(DEFAULT_DYNAMIC_ARRAY_CAPACITY, arg_len)

-			// do not 'or_return' here as it could be a partial success
-			if should_zero {
-				err = reserve(array, cap, loc)
-			} else {
-				err = non_zero_reserve(array, cap, loc)
-			}
-		}
-		arg_len = min(cap(array)-len(array), arg_len)
-		if arg_len > 0 {
-			a := (^Raw_Dynamic_Array)(array)
-			when size_of(E) != 0 {
-				data := ([^]E)(a.data)
-				assert(data != nil, loc=loc)
-				intrinsics.mem_copy(&data[a.len], raw_data(args), size_of(E) * arg_len)
-			}
-			a.len += arg_len
-		}
-		return arg_len, err
+		// do not 'or_return' here as it could be a partial success
+		err = _reserve_dynamic_array(array, size_of_elem, align_of_elem, cap, should_zero, loc)
 	}
+	arg_len := arg_len
+	arg_len = min(array.cap-array.len, arg_len)
+	if arg_len > 0 {
+		data := ([^]byte)(array.data)
+		assert(data != nil, loc=loc)
+		data = data[array.len*size_of_elem:]
+		intrinsics.mem_copy(data, args, size_of_elem * arg_len) // must be mem_copy (overlapping)
+		array.len += arg_len
+	}
+	return arg_len, err
 }

+// `append_elems` appends `args` to the end of `array`, asking for zeroed memory when
+// the array has to grow.
+//
+// Returns the number of elements appended and any allocator error.
+// A nil `array` is a no-op that returns `0, nil`, for zero-sized element types as well.
@builtin
 append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elems(array, true, loc, ..args)
+	when size_of(E) == 0 {
+		// Zero-sized elements need no storage, so only the length changes.
+		// Guard against nil here because this branch never reaches the nil check in `_append_elems`.
+		if array == nil {
+			return 0, nil
+		}
+		a := (^Raw_Dynamic_Array)(array)
+		a.len += len(args)
+		return len(args), nil
+	} else {
+		return _append_elems((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), true, loc, raw_data(args), len(args))
+	}
 }

+// `non_zero_append_elems` appends `args` to the end of `array` without asking for
+// zeroed memory when the array has to grow.
+//
+// Returns the number of elements appended and any allocator error.
+// A nil `array` is a no-op that returns `0, nil`, for zero-sized element types as well.
@builtin
 non_zero_append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elems(array, false, loc, ..args)
+	when size_of(E) == 0 {
+		// Zero-sized elements need no storage, so only the length changes.
+		// Guard against nil here because this branch never reaches the nil check in `_append_elems`.
+		if array == nil {
+			return 0, nil
+		}
+		a := (^Raw_Dynamic_Array)(array)
+		a.len += len(args)
+		return len(args), nil
+	} else {
+		return _append_elems((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), false, loc, raw_data(args), len(args))
+	}
 }

 // The append_string built-in procedure appends a string to the end of a [dynamic]u8 like type
+//
+// The bytes of `arg` are handed to `_append_elems` with an element size and alignment of 1;
+// `should_zero` is passed through unchanged.
 _append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	args := transmute([]E)arg
-	if should_zero { 
-		return append_elems(array, ..args, loc=loc)
-	} else {
-		return non_zero_append_elems(array, ..args, loc=loc)
-	}
+	return _append_elems((^Raw_Dynamic_Array)(array), 1, 1, should_zero, loc, raw_data(arg), len(arg))
 }

@builtin
@@ -679,7 +682,7 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #calle


@builtin
-assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(args)
 	if len(args) == 0 {
 		ok = true
@@ -729,11 +732,10 @@ clear_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) {
 // `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
 //
 // Note: Prefer the procedure group `reserve`.
-_reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
-	if array == nil {
+_reserve_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+	if a == nil {
 		return nil
 	}
-	a := (^Raw_Dynamic_Array)(array)

 	if capacity <= a.cap {
 		return nil
@@ -744,15 +746,15 @@ _reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: i
 	}
 	assert(a.allocator.procedure != nil)

-	old_size  := a.cap * size_of(E)
-	new_size  := capacity * size_of(E)
+	old_size  := a.cap * size_of_elem
+	new_size  := capacity * size_of_elem
 	allocator := a.allocator

 	new_data: []byte
 	if should_zero {
-		new_data = mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	} else {
-		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	}
 	if new_data == nil && new_size > 0 {
 		return .Out_Of_Memory
@@ -765,26 +767,23 @@ _reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: i

+// `reserve_dynamic_array` forwards to `_reserve_dynamic_array` with the size and alignment
+// of the element type `E` and with `should_zero` set to true.
@builtin
 reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return _reserve_dynamic_array(array, capacity, true, loc)
+	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, true, loc)
 }

+// `non_zero_reserve_dynamic_array` forwards to `_reserve_dynamic_array` with the size and
+// alignment of the element type `E` and with `should_zero` set to false.
@builtin
 non_zero_reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return _reserve_dynamic_array(array, capacity, false, loc)
+	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, false, loc)
 }

-// `resize_dynamic_array` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
-//
-// Note: Prefer the procedure group `resize`
-_resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
-	if array == nil {
+
+_resize_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+	if a == nil {
 		return nil
 	}
-	a := (^Raw_Dynamic_Array)(array)

 	if length <= a.cap {
 		if should_zero && a.len < length {
-			intrinsics.mem_zero(([^]E)(a.data)[a.len:], (length-a.len)*size_of(E))
+			intrinsics.mem_zero(([^]byte)(a.data)[a.len*size_of_elem:], (length-a.len)*size_of_elem)
 		}
 		a.len = max(length, 0)
 		return nil
@@ -795,15 +794,15 @@ _resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int,
 	}
 	assert(a.allocator.procedure != nil)

-	old_size  := a.cap * size_of(E)
-	new_size  := length * size_of(E)
+	old_size  := a.cap  * size_of_elem
+	new_size  := length * size_of_elem
 	allocator := a.allocator

 	new_data : []byte
 	if should_zero {
-		new_data = mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	} else {
-		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	}
 	if new_data == nil && new_size > 0 {
 		return .Out_Of_Memory
@@ -815,14 +814,17 @@ _resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int,
 	return nil
 }

+// `resize_dynamic_array` will try to resize memory of a passed dynamic array to the requested element count (setting the `len`, and possibly `cap`).
+// It forwards to `_resize_dynamic_array` with the size and alignment of `E` and with `should_zero` set to true.
+//
+// Note: Prefer the procedure group `resize`
@builtin
 resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
-	return _resize_dynamic_array(array, length, true, loc=loc)
+	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, true, loc=loc)
 }

+// `non_zero_resize_dynamic_array` forwards to `_resize_dynamic_array` with the size and
+// alignment of the element type `E` and with `should_zero` set to false.
@builtin
 non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
-	return _resize_dynamic_array(array, length, false, loc=loc)
+	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, false, loc=loc)
 }

 /*
@@ -837,10 +839,13 @@ non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: i
 	Note: Prefer the procedure group `shrink`
 */
 shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
-	if array == nil {
+	return _shrink_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), new_cap, loc)
+}
+
+_shrink_dynamic_array :: proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
+	if a == nil {
 		return
 	}
-	a := (^Raw_Dynamic_Array)(array)

 	new_cap := new_cap if new_cap >= 0 else a.len

@@ -853,10 +858,10 @@ shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, new_cap := -1, loc := #call
 	}
 	assert(a.allocator.procedure != nil)

-	old_size := a.cap * size_of(E)
-	new_size := new_cap * size_of(E)
+	old_size := a.cap * size_of_elem
+	new_size := new_cap * size_of_elem

-	new_data := mem_resize(a.data, old_size, new_size, align_of(E), a.allocator, loc) or_return
+	new_data := mem_resize(a.data, old_size, new_size, align_of_elem, a.allocator, loc) or_return

 	a.data = raw_data(new_data)
 	a.len = min(new_cap, a.len)
@@ -943,3 +948,30 @@ unimplemented :: proc(message := "", loc := #caller_location) -> ! {
 	}
 	p("not yet implemented", message, loc)
 }
+
+
+// `assert_contextless` is an assertion usable from "contextless" procedures.
+// When `condition` is false it reports "runtime assertion" with `message` and the
+// caller's location through `default_assertion_contextless_failure_proc`.
+// The procedure is disabled when `ODIN_DISABLE_ASSERT` is set.
+@builtin
+@(disabled=ODIN_DISABLE_ASSERT)
+assert_contextless :: proc "contextless" (condition: bool, message := "", loc := #caller_location) {
+	// NOTE(bill): The failure path is wrapped in its own cold procedure
+	// call to improve performance: it keeps the CPU from executing it
+	// speculatively, making the check about an order of magnitude faster
+	@(cold)
+	report_failure :: proc "contextless" (message: string, loc: Source_Code_Location) {
+		default_assertion_contextless_failure_proc("runtime assertion", message, loc)
+	}
+
+	if condition {
+		return
+	}
+	report_failure(message, loc)
+}
+
+// `panic_contextless` reports "panic" with `message` and the caller's location through
+// `default_assertion_contextless_failure_proc`. It is declared as never returning (`-> !`)
+// and can be called from "contextless" procedures.
+@builtin
+panic_contextless :: proc "contextless" (message: string, loc := #caller_location) -> ! {
+	default_assertion_contextless_failure_proc("panic", message, loc)
+}
+
+// `unimplemented_contextless` reports "not yet implemented" with an optional `message` and
+// the caller's location through `default_assertion_contextless_failure_proc`. It is declared
+// as never returning (`-> !`) and can be called from "contextless" procedures.
+@builtin
+unimplemented_contextless :: proc "contextless" (message := "", loc := #caller_location) -> ! {
+	default_assertion_contextless_failure_proc("not yet implemented", message, loc)
+}
@@ -352,7 +352,7 @@ non_zero_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, #no_broadcast args
 }


-_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, zero_memory: bool, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, zero_memory: bool, #no_broadcast args: []E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return
 	}
@@ -1,8 +1,8 @@
 package runtime

 nil_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
-                               size, alignment: int,
-                               old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
+                           size, alignment: int,
+                           old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, Allocator_Error) {
 	switch mode {
 	case .Alloc, .Alloc_Non_Zeroed:
 		return nil, .Out_Of_Memory
@@ -129,7 +129,7 @@ arena_alloc :: proc(arena: ^Arena, size, alignment: uint, loc := #caller_locatio
 	return
 }

-// `arena_init` will initialize the arena with a usuable block.
+// `arena_init` will initialize the arena with a usable block.
 // This procedure is not necessary to use the Arena as the default zero as `arena_alloc` will set things up if necessary
@(require_results)
 arena_init :: proc(arena: ^Arena, size: uint, backing_allocator: Allocator, loc := #caller_location) -> Allocator_Error {
@@ -577,7 +577,7 @@ map_grow_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Inf


@(require_results)
-map_reserve_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uintptr, loc := #caller_location) -> Allocator_Error {
+map_reserve_dynamic :: #force_no_inline proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uintptr, loc := #caller_location) -> Allocator_Error {
 	@(require_results)
 	ceil_log2 :: #force_inline proc "contextless" (x: uintptr) -> uintptr {
 		z := intrinsics.count_leading_zeros(x)
@@ -641,7 +641,7 @@ map_reserve_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_


@(require_results)
-map_shrink_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
+map_shrink_dynamic :: #force_no_inline proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
 	if m.allocator.procedure == nil {
 		m.allocator = context.allocator
 	}
@@ -688,7 +688,7 @@ map_shrink_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_I
 }

@(require_results)
-map_free_dynamic :: proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_location) -> Allocator_Error {
+map_free_dynamic :: #force_no_inline proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_location) -> Allocator_Error {
 	ptr := rawptr(map_data(m))
 	size := int(map_total_allocation_size(uintptr(map_cap(m)), info))
 	err := mem_free_with_size(ptr, size, m.allocator, loc)
@@ -700,7 +700,7 @@ map_free_dynamic :: proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_loc
 }

@(require_results)
-map_lookup_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (index: uintptr, ok: bool) {
+map_lookup_dynamic :: #force_no_inline proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (index: uintptr, ok: bool) {
 	if map_len(m) == 0 {
 		return 0, false
 	}
@@ -723,7 +723,7 @@ map_lookup_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info,
 	}
 }
@(require_results)
-map_exists_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (ok: bool) {
+map_exists_dynamic :: #force_no_inline proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (ok: bool) {
 	if map_len(m) == 0 {
 		return false
 	}
@@ -749,7 +749,7 @@ map_exists_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info,


@(require_results)
-map_erase_dynamic :: #force_inline proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (old_k, old_v: uintptr, ok: bool) {
+map_erase_dynamic :: #force_no_inline proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (old_k, old_v: uintptr, ok: bool) {
 	index := map_lookup_dynamic(m^, info, k) or_return
 	ks, vs, hs, _, _ := map_kvh_data_dynamic(m^, info)
 	hs[index] |= TOMBSTONE_MASK
@@ -34,6 +34,9 @@ when ODIN_BUILD_MODE == .Dynamic {
 		} else when ODIN_OS == .Darwin && ODIN_ARCH == .arm64 {
 			@require foreign import entry "entry_unix_no_crt_darwin_arm64.asm"
 			SYS_exit :: 1
+		} else when ODIN_ARCH == .riscv64 {
+			@require foreign import entry "entry_unix_no_crt_riscv64.asm"
+			SYS_exit :: 93
 		}
 		@(link_name="_start_odin", linkage="strong", require)
 		_start_odin :: proc "c" (argc: i32, argv: [^]cstring) -> ! {
@@ -0,0 +1,10 @@
+# Program entry point `_start`: loads two arguments from the initial stack and calls `_start_odin`.
+.text
+
+.globl _start
+
+_start:
+	# a0 = doubleword stored at sp (presumably argc - confirm against the platform's process-entry ABI)
+	ld a0, 0(sp)
+	# a1 = address 8 bytes above sp (presumably the start of the argv array)
+	addi a1, sp, 8
+	# NOTE(review): `~15` is -16, so this lowers sp by 16 bytes; it does not mask the low bits.
+	# sp stays 16-byte aligned only if it was already aligned on entry - confirm `andi` was not intended.
+	addi sp, sp, ~15
+	call _start_odin
+	# Breakpoint trap in case `_start_odin` ever returns
+	ebreak
@@ -10,8 +10,9 @@ when ODIN_BUILD_MODE == .Dynamic {
 	DllMain :: proc "system" (hinstDLL: rawptr, fdwReason: u32, lpReserved: rawptr) -> b32 {
 		context = default_context()

-		// Populate Windows DLL-specific global
+		// Populate Windows DLL-specific globals
 		dll_forward_reason = DLL_Forward_Reason(fdwReason)
+		dll_instance       = hinstDLL

 		switch dll_forward_reason {
 		case .Process_Attach:
@@ -19,12 +19,15 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 	// the pointer we return to the user.
 	//

-	aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr = nil, zero_memory := true) -> ([]byte, Allocator_Error) {
+	aligned_alloc :: proc(size, alignment: int, old_ptr: rawptr, old_size: int, zero_memory := true) -> ([]byte, Allocator_Error) {
 		a := max(alignment, align_of(rawptr))
 		space := size + a - 1

 		allocated_mem: rawptr
-		if old_ptr != nil {
+
+		force_copy := old_ptr != nil && a > align_of(rawptr)
+
+		if !force_copy && old_ptr != nil {
 			original_old_ptr := ([^]rawptr)(old_ptr)[-1]
 			allocated_mem = heap_resize(original_old_ptr, space+size_of(rawptr))
 		} else {
@@ -36,12 +39,19 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 		aligned_ptr := (ptr - 1 + uintptr(a)) & -uintptr(a)
 		diff := int(aligned_ptr - ptr)
 		if (size + diff) > space || allocated_mem == nil {
+			aligned_free(old_ptr)
+			aligned_free(allocated_mem)
 			return nil, .Out_Of_Memory
 		}

 		aligned_mem = rawptr(aligned_ptr)
 		([^]rawptr)(aligned_mem)[-1] = allocated_mem

+		if force_copy {
+			mem_copy_non_overlapping(aligned_mem, old_ptr, old_size)
+			aligned_free(old_ptr)
+		}
+
 		return byte_slice(aligned_mem, size), nil
 	}

@@ -53,10 +63,10 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,

 	aligned_resize :: proc(p: rawptr, old_size: int, new_size: int, new_alignment: int, zero_memory := true) -> (new_memory: []byte, err: Allocator_Error) {
 		if p == nil {
-			return nil, nil
+			return aligned_alloc(new_size, new_alignment, nil, old_size, zero_memory)
 		}

-		new_memory = aligned_alloc(new_size, new_alignment, p, zero_memory) or_return
+		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return

 		// NOTE: heap_resize does not zero the new memory, so we do it
 		if zero_memory && new_size > old_size {
@@ -68,7 +78,7 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,

 	switch mode {
 	case .Alloc, .Alloc_Non_Zeroed:
-		return aligned_alloc(size, alignment, nil, mode == .Alloc)
+		return aligned_alloc(size, alignment, nil, 0, mode == .Alloc)

 	case .Free:
 		aligned_free(old_memory)
@@ -77,9 +87,6 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 		return nil, .Mode_Not_Implemented

 	case .Resize, .Resize_Non_Zeroed:
-		if old_memory == nil {
-			return aligned_alloc(size, alignment, nil, mode == .Resize)
-		}
 		return aligned_resize(old_memory, old_size, size, alignment, mode == .Resize)

 	case .Query_Features:
@@ -8,10 +8,9 @@ IS_WASM :: ODIN_ARCH == .wasm32 || ODIN_ARCH == .wasm64p32

@(private)
 RUNTIME_LINKAGE :: "strong" when (
-	(ODIN_USE_SEPARATE_MODULES || 
+	ODIN_USE_SEPARATE_MODULES || 
 	ODIN_BUILD_MODE == .Dynamic ||
-	!ODIN_NO_CRT) &&
-	!IS_WASM) else "internal"
+	!ODIN_NO_CRT) else "internal"
 RUNTIME_REQUIRE :: false // !ODIN_TILDE

@(private)
@@ -879,9 +878,6 @@ extendhfsf2 :: proc "c" (value: __float16) -> f32 {

@(link_name="__floattidf", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 floattidf :: proc "c" (a: i128) -> f64 {
-when IS_WASM {
-	return 0
-} else {
 	DBL_MANT_DIG :: 53
 	if a == 0 {
 		return 0.0
@@ -921,14 +917,10 @@ when IS_WASM {
 	fb[0] = u32(a)                           // mantissa-low
 	return transmute(f64)fb
 }
-}


@(link_name="__floattidf_unsigned", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 floattidf_unsigned :: proc "c" (a: u128) -> f64 {
-when IS_WASM {
-	return 0
-} else {
 	DBL_MANT_DIG :: 53
 	if a == 0 {
 		return 0.0
@@ -966,7 +958,6 @@ when IS_WASM {
 	fb[0] = u32(a)                           // mantissa-low
 	return transmute(f64)fb
 }
-}



@@ -1023,14 +1014,32 @@ modti3 :: proc "c" (a, b: i128) -> i128 {

+// `divmodti4` provides `__divmodti4`: signed 128-bit division that returns the quotient
+// and stores the remainder through `rem`.
+//
+// Both operands are reduced to their magnitudes and divided with `udivmodti4`; the signs
+// are restored afterwards. The quotient is negated when the operand signs differ and the
+// remainder takes the sign of `a`.
+// `rem` is written unconditionally, so it must not be nil.
@(link_name="__divmodti4", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 divmodti4 :: proc "c" (a, b: i128, rem: ^i128) -> i128 {
-	u := udivmod128(u128(a), u128(b), (^u128)(rem))
-	return i128(u)
+	s_a := a >> (128 - 1) // -1 if negative or 0
+	s_b := b >> (128 - 1)
+	an := (a ~ s_a) - s_a // absolute
+	bn := (b ~ s_b) - s_b
+
+	s_b   ~= s_a // quotient sign
+	u_s_b := u128(s_b)
+	u_s_a := u128(s_a)
+
+	r: u128 = ---
+	u := i128((udivmodti4(u128(an), u128(bn), &r) ~ u_s_b) - u_s_b) // negate if negative
+	rem^ = i128((r ~ u_s_a) - u_s_a)
+	return u
 }

+// `divti3` provides `__divti3`: signed 128-bit division returning only the quotient.
+//
+// Both operands are reduced to their magnitudes and divided with `udivmodti4` (no remainder
+// requested); the quotient is then negated when the operand signs differ.
@(link_name="__divti3", linkage=RUNTIME_LINKAGE, require=RUNTIME_REQUIRE)
 divti3 :: proc "c" (a, b: i128) -> i128 {
-	u := udivmodti4(u128(a), u128(b), nil)
-	return i128(u)
+	s_a := a >> (128 - 1) // -1 if negative or 0
+	s_b := b >> (128 - 1)
+	an := (a ~ s_a) - s_a // absolute
+	bn := (b ~ s_b) - s_b
+
+	s_a   ~= s_b // quotient sign
+	u_s_a := u128(s_a)
+
+	return i128((udivmodti4(u128(an), u128(bn), nil) ~ u_s_a) - u_s_a) // negate if negative
 }


@@ -5,11 +5,24 @@ package runtime
 import "base:intrinsics"

+// `_stderr_write` writes `data` to file descriptor 2 (standard error).
+// Returns the number of bytes written and 0 on success, or 0 and an errno value on failure.
 _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
-	WRITE  :: 0x2000004
 	STDERR :: 2
-	ret := intrinsics.syscall(WRITE, STDERR, uintptr(raw_data(data)), uintptr(len(data)))
-	if ret < 0 {
-		return 0, _OS_Errno(-ret)
+	when ODIN_NO_CRT {
+		// Without the C runtime, issue the write system call directly;
+		// a negative result is treated as a negated errno
+		WRITE  :: 0x2000004
+		ret := intrinsics.syscall(WRITE, STDERR, uintptr(raw_data(data)), uintptr(len(data)))
+		if ret < 0 {
+			return 0, _OS_Errno(-ret)
+		}
+		return int(ret), 0
+	} else {
+		// With the C runtime, call the foreign `write` and read the error code through `__error()`
+		foreign {
+			write   :: proc(handle: i32, buffer: [^]byte, count: uint) -> int ---
+			__error :: proc() -> ^i32 ---
+		}
+
+		if ret := write(STDERR, raw_data(data), len(data)); ret >= 0 {
+			return int(ret), 0
+		}
+
+		return 0, _OS_Errno(__error()^)
 	}
-	return int(ret), 0
 }
@@ -12,6 +12,8 @@ _stderr_write :: proc "contextless" (data: []byte) -> (int, _OS_Errno) {
 		SYS_write :: uintptr(4)
 	} else when ODIN_ARCH == .arm32 {
 		SYS_write :: uintptr(4)
+	} else when ODIN_ARCH == .riscv64 {
+		SYS_write :: uintptr(64)
 	}

 	stderr :: 2
@@ -262,7 +262,7 @@ print_typeid :: #force_no_inline proc "contextless" (id: typeid) {
 	}
 }

-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 	if ti == nil {
 		print_string("nil")
@@ -401,15 +401,16 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		}

 		print_string("struct ")
-		if info.is_packed    { print_string("#packed ") }
-		if info.is_raw_union { print_string("#raw_union ") }
-		if info.custom_align {
+		if .packed    in info.flags { print_string("#packed ") }
+		if .raw_union in info.flags { print_string("#raw_union ") }
+		if .no_copy   in info.flags { print_string("#no_copy ") }
+		if .align in info.flags {
 			print_string("#align(")
 			print_u64(u64(ti.align))
 			print_string(") ")
 		}
 		print_byte('{')
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { print_string(", ") }
 			print_string(name)
 			print_string(": ")
@@ -469,7 +470,7 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		print_string("bit_field ")
 		print_type(info.backing_type)
 		print_string(" {")
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { print_string(", ") }
 			print_string(name)
 			print_string(": ")
@@ -52,3 +52,24 @@ udivti3 :: proc "c" (la, ha, lb, hb: u64) -> u128 {
 	b.lo, b.hi = lb, hb
 	return udivmodti4(a.all, b.all, nil)
 }
+
+// `__lshrti3` performs a logical (zero-filling) right shift of a 128-bit value.
+//
+// Inputs:
+// - la, ha: the low and high 64-bit halves of the value
+// - b: the shift amount in bits; expected to be in the range 0..<128
+//
+// Returns:
+// - the shifted 128-bit value, read back through `ti_int.all`
+@(link_name="__lshrti3", linkage="strong")
+__lshrti3 :: proc "c" (la, ha: u64, b: u32) -> i128 {
+	// Width in bits of one half of the value. The halves are `u64`, so this must be 64;
+	// deriving it from `u32` made every shift of 32 bits or more produce a wrong result.
+	bits :: size_of(u64)*8
+
+	input, result: ti_int
+	input.lo = la
+	input.hi = ha
+
+	if b & bits != 0 {
+		// bits <= b: the whole low half is shifted out and the high half moves down
+		result.hi = 0
+		result.lo = input.hi >> (b - bits)
+	} else if b == 0 {
+		// Nothing to shift; this also avoids the full-width `<< bits` in the branch below
+		return input.all
+	} else {
+		// 0 < b < bits: the low bits of the high half carry over into the low half
+		result.hi = input.hi >> b
+		result.lo = (input.hi << (bits - b)) | (input.lo >> b)
+	}
+
+	return result.all
+}
@@ -0,0 +1,34 @@
+package runtime
+
+Thread_Local_Cleaner :: #type proc "odin" ()
+
+@(private="file")
+thread_local_cleaners: [8]Thread_Local_Cleaner
+
+// Register a procedure to be run at the end of a thread so that it can
+// deallocate state marked as `thread_local`.
+//
+// Meant to be called from the `init` procedure of a package that keeps
+// dynamically-allocated memory in `thread_local` variables.
+//
+// The procedure is stored in the first empty slot; this panics when every
+// slot is already taken.
+add_thread_local_cleaner :: proc "contextless" (p: Thread_Local_Cleaner) {
+	for i in 0..<len(thread_local_cleaners) {
+		slot := &thread_local_cleaners[i]
+		if slot^ != nil {
+			continue
+		}
+		slot^ = p
+		return
+	}
+	panic_contextless("There are no more thread-local cleaner slots available.")
+}
+
+// Call every registered thread-local cleaner procedure in slot order,
+// stopping at the first empty slot.
+//
+// Meant to be called by the internals of a threading API when a thread
+// reaches the end of its lifetime.
+run_thread_local_cleaners :: proc "odin" () {
+	for i := 0; i < len(thread_local_cleaners); i += 1 {
+		cleaner := thread_local_cleaners[i]
+		if cleaner == nil {
+			return
+		}
+		cleaner()
+	}
+}
@@ -297,7 +297,8 @@ lock :: proc(a: ^WASM_Allocator) {
 					return
 				}

-				assert(intrinsics.wasm_memory_atomic_wait32((^u32)(&a.mu), u32(new_state), -1) != 0)
+				ret := intrinsics.wasm_memory_atomic_wait32((^u32)(&a.mu), u32(new_state), -1)
+				assert(ret != 0)
 				intrinsics.cpu_relax()
 			}
 		}
@@ -48,6 +48,9 @@ if "%2" == "1" (
 set odin_version_raw="dev-%curr_year%-%curr_month%"

 set compiler_flags= -nologo -Oi -TP -fp:precise -Gm- -MP -FC -EHsc- -GR- -GF
+rem Parse source code as utf-8 even on shift-jis and other codepages
+rem See https://learn.microsoft.com/en-us/cpp/build/reference/utf-8-set-source-and-executable-character-sets-to-utf-8?view=msvc-170
+set compiler_flags= %compiler_flags% /utf-8
 set compiler_defines= -DODIN_VERSION_RAW=\"%odin_version_raw%\"

 if not exist .git\ goto skip_git_hash
@@ -111,7 +114,10 @@ call build_vendor.bat
 if %errorlevel% neq 0 goto end_of_build

 rem If the demo doesn't run for you and your CPU is more than a decade old, try -microarch:native
-if %release_mode% EQU 0 odin run examples/demo -- Hellope World
+if %release_mode% EQU 0 odin run examples/demo -vet -strict-style -- Hellope World
+
+rem Many non-compiler devs seem to run debug build but don't realize.
+if %release_mode% EQU 0 echo: & echo Debug compiler built. Note: run "build.bat release" if you want a faster, release mode compiler.

 del *.obj > NUL 2> NUL

@@ -23,6 +23,14 @@ error() {
 	exit 1
 }

+# Brew advises people not to add llvm to their $PATH, so try and use brew to find it.
+if [ -z "$LLVM_CONFIG" ] &&  [ -n "$(command -v brew)" ]; then
+    if   [ -n "$(command -v $(brew --prefix llvm@18)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@18)/bin/llvm-config"
+    elif [ -n "$(command -v $(brew --prefix llvm@17)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@17)/bin/llvm-config"
+    elif [ -n "$(command -v $(brew --prefix llvm@14)/bin/llvm-config)" ]; then LLVM_CONFIG="$(brew --prefix llvm@14)/bin/llvm-config"
+    fi
+fi
+
 if [ -z "$LLVM_CONFIG" ]; then
 	# darwin, linux, openbsd
 	if   [ -n "$(command -v llvm-config-18)" ]; then LLVM_CONFIG="llvm-config-18"
@@ -95,7 +103,7 @@ Linux)
 	LDFLAGS="$LDFLAGS -ldl $($LLVM_CONFIG --libs core native --system-libs --libfiles)"
 	# Copy libLLVM*.so into current directory for linking
 	# NOTE: This is needed by the Linux release pipeline!
-	cp $(readlink -f $($LLVM_CONFIG --libfiles)) ./
+	# cp $(readlink -f $($LLVM_CONFIG --libfiles)) ./
 	LDFLAGS="$LDFLAGS -Wl,-rpath=\$ORIGIN"
 	;;
 OpenBSD)
@@ -144,12 +152,17 @@ build_odin() {
 }

+# Builds and runs examples/demo, but only when this function is called with no
+# arguments or with "debug" as its first argument.
+# Note: $# and $1 here are the function's own arguments, not the script's.
 run_demo() {
-	./odin run examples/demo/demo.odin -file -- Hellope World
+	if [ $# -eq 0 ] || [ "$1" = "debug" ]; then
+		./odin run examples/demo -vet -strict-style -- Hellope World
+	fi
 }

 if [ $# -eq 0 ]; then
 	build_odin debug
 	run_demo
+
+	: ${PROGRAM:=$0}
+	printf "\nDebug compiler built. Note: run \"$PROGRAM release\" or \"$PROGRAM release-native\" if you want a faster, release mode compiler.\n"
 elif [ $# -eq 1 ]; then
 	case $1 in
 	report)
@@ -144,6 +144,9 @@ buffer_grow :: proc(b: ^Buffer, n: int, loc := #caller_location) {
 }

 buffer_write_at :: proc(b: ^Buffer, p: []byte, offset: int, loc := #caller_location) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	b.last_read = .Invalid
 	if offset < 0 {
 		err = .Invalid_Offset
@@ -246,10 +249,13 @@ buffer_read_ptr :: proc(b: ^Buffer, ptr: rawptr, size: int) -> (n: int, err: io.
 }

 buffer_read_at :: proc(b: ^Buffer, p: []byte, offset: int) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	b.last_read = .Invalid

 	if uint(offset) >= len(b.buf) {
-		err = .Invalid_Offset
+		err = .EOF
 		return
 	}
 	n = copy(p, b.buf[offset:])
@@ -310,6 +316,27 @@ buffer_unread_rune :: proc(b: ^Buffer) -> io.Error {
 	return nil
 }

+/*
+Move the read offset of a buffer.
+
+Inputs:
+- b: The buffer whose offset is changed.
+- offset: The distance to move, relative to the position chosen by `whence`.
+- whence: `.Start` (beginning of the data), `.Current` (the current offset), or `.End` (the end of the data).
+
+Returns:
+- The new absolute offset, or 0 together with `.Invalid_Whence` for an unknown `whence`
+  or `.Invalid_Offset` when the resulting offset would be negative.
+*/
+buffer_seek :: proc(b: ^Buffer, offset: i64, whence: io.Seek_From) -> (i64, io.Error) {
+	base: i64
+	switch whence {
+	case .Start:
+		base = 0
+	case .Current:
+		base = i64(b.off)
+	case .End:
+		base = i64(len(b.buf))
+	case:
+		return 0, .Invalid_Whence
+	}
+
+	target := base + offset
+	new_off := int(target)
+	if new_off < 0 {
+		return 0, .Invalid_Offset
+	}
+
+	// Seeking invalidates any pending unread operation
+	b.last_read = .Invalid
+	b.off = new_off
+	return target, nil
+}

 buffer_read_bytes :: proc(b: ^Buffer, delim: byte) -> (line: []byte, err: io.Error) {
 	i := index_byte(b.buf[b.off:], delim)
@@ -395,14 +422,17 @@ _buffer_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offse
 		return io._i64_err(buffer_write(b, p))
 	case .Write_At:
 		return io._i64_err(buffer_write_at(b, p, int(offset)))
+	case .Seek:
+		n, err = buffer_seek(b, offset, whence)
+		return
 	case .Size:
-		n = i64(buffer_capacity(b))
+		n = i64(buffer_length(b))
 		return
 	case .Destroy:
 		buffer_destroy(b)
 		return
 	case .Query:
-		return io.query_utility({.Read, .Read_At, .Write, .Write_At, .Size, .Destroy})
+		return io.query_utility({.Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Destroy, .Query})
 	}
 	return 0, .Empty
 }
@@ -1,9 +1,38 @@
 package bytes

+import "base:intrinsics"
 import "core:mem"
+import "core:simd"
 import "core:unicode"
 import "core:unicode/utf8"

+// Constants for the SIMD byte scanners. The 256-bit set only exists on amd64 targets with AVX2.
+when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+	// Lane numbers 0..31; `index_byte` selects from these to turn matching lanes into byte offsets.
+	@(private)
+	SCANNER_INDICES_256 : simd.u8x32 : {
+		0,  1,  2,  3,  4,  5,  6,  7,
+		8,  9, 10, 11, 12, 13, 14, 15,
+		16, 17, 18, 19, 20, 21, 22, 23,
+		24, 25, 26, 27, 28, 29, 30, 31,
+	}
+	// All lanes 0x00. NOTE(review): presumably the filler for non-matching lanes ahead of a
+	// max-reduction (last match) - no use is visible in this part of the file, confirm.
+	@(private)
+	SCANNER_SENTINEL_MAX_256: simd.u8x32 : u8(0x00)
+	// All lanes 0xff: the filler `index_byte` gives non-matching lanes before `reduce_min`,
+	// so that the smallest matching lane index wins.
+	@(private)
+	SCANNER_SENTINEL_MIN_256: simd.u8x32 : u8(0xff)
+	// Number of bytes in one 256-bit vector.
+	@(private)
+	SIMD_REG_SIZE_256 :: 32
+}
+// 128-bit counterparts of the constants above, declared for every target.
+// Lane numbers 0..15.
+@(private)
+SCANNER_INDICES_128 : simd.u8x16 : {
+	0,  1,  2,  3,  4,  5,  6,  7,
+	8,  9, 10, 11, 12, 13, 14, 15,
+}
+// All lanes 0x00 (see the NOTE on the 256-bit variant).
+@(private)
+SCANNER_SENTINEL_MAX_128: simd.u8x16 : u8(0x00)
+// All lanes 0xff.
+@(private)
+SCANNER_SENTINEL_MIN_128: simd.u8x16 : u8(0xff)
+// Number of bytes in one 128-bit vector; `index_byte` uses a scalar loop for inputs shorter than this.
+@(private)
+SIMD_REG_SIZE_128 :: 16
+
 clone :: proc(s: []byte, allocator := context.allocator, loc := #caller_location) -> []byte {
 	c := make([]byte, len(s), allocator, loc)
 	copy(c, s)
@@ -293,28 +322,279 @@ split_after_iterator :: proc(s: ^[]byte, sep: []byte) -> ([]byte, bool) {
 	return _split_iterator(s, sep, len(sep))
 }

+/*
+Scan a slice of bytes for a specific byte.

-index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := 0; i < len(s); i += 1 {
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- index: The index of the first occurrence of the byte `c`, or -1 if it was not found.
+*/
+index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
+	i, l := 0, len(s)
+
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if l < SIMD_REG_SIZE_128 {
+		for /**/; i < l; i += 1 {
+			if s[i] == c {
+				return i
+			}
+		}
+		return -1
+	}
+
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
+
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
+
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+
+				i += 4 * SIMD_REG_SIZE_256
+			}
+
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for nr_blocks := (l - i) / (2 * SIMD_REG_SIZE_256); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_256, SCANNER_SENTINEL_MIN_256)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+
+				i += 2 * SIMD_REG_SIZE_256
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for nr_blocks := l / (4 * SIMD_REG_SIZE_128); nr_blocks > 0; nr_blocks -= 1 {
+				#unroll for j in 0..<4 {
+					s_vecs[j]= intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[j] > 0 {
+							sel := simd.select(c_vecs[j], SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128)
+							off := simd.reduce_min(sel)
+							return i + j * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+
+				i += 4 * SIMD_REG_SIZE_128
+			}
+		}
+	}
+
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for nr_blocks := (l - i) / SIMD_REG_SIZE_128; nr_blocks > 0; nr_blocks -= 1 {
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 { // at least one lane matched
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MIN_128) // non-matching lanes become 0xff
+			off := simd.reduce_min(sel) // lowest matching lane index
+			return i + int(off)
+		}
+
+		i += SIMD_REG_SIZE_128
+	}
+
+	// Scan serially for the remainder.
+	for /**/; i < l; i += 1 {
 		if s[i] == c {
 			return i
 		}
 	}
+
 	return -1
 }

-// Returns -1 if c is not present
-last_index_byte :: proc(s: []byte, c: byte) -> int {
-	for i := len(s)-1; i >= 0; i -= 1 {
+/*
+Scan a slice of bytes for a specific byte, starting from the end and working
+backwards to the start.
+
+This procedure safely handles slices of any length, including empty slices.
+
+Inputs:
+- s: A slice of bytes.
+- c: The byte to search for.
+
+Returns:
+- The index of the last occurrence of the byte `c`, or -1 if it was not found.
+*/
+last_index_byte :: proc(s: []byte, c: byte) -> int #no_bounds_check {
+	i := len(s)
+
+	// Guard against small strings.  On modern systems, it is ALWAYS
+	// worth vectorizing assuming there is a hardware vector unit, and
+	// the data size is large enough.
+	if i < SIMD_REG_SIZE_128 {
+		// Pre-decrement: first read is s[len(s)-1]; empty/nil never loops.
+		for i > 0 {
+			i -= 1
+			if s[i] == c {
+				return i
+			}
+		}
+		return -1
+	}
+
+	c_vec: simd.u8x16 = c
+	when !simd.IS_EMULATED {
+		// Note: While this is something that could also logically take
+		// advantage of AVX512, the various downclocking and power
+		// consumption related woes make it premature to have a dedicated
+		// code path.
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			c_vec_256: simd.u8x32 = c
+
+			s_vecs: [4]simd.u8x32 = ---
+			c_vecs: [4]simd.u8x32 = ---
+			m_vec: [4]u8 = ---
+
+			// Scan 128-byte chunks, using 256-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_256 {
+				i -= 4 * SIMD_REG_SIZE_256
+
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] | m_vec[2] | m_vec[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vec[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
+
+			// Scan 64-byte chunks, using 256-bit SIMD.
+			for i >= 2 * SIMD_REG_SIZE_256 {
+				i -= 2 * SIMD_REG_SIZE_256
+
+				#unroll for j in 0..<2 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x32)raw_data(s[i+j*SIMD_REG_SIZE_256:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec_256)
+					m_vec[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vec[0] | m_vec[1] > 0 {
+					#unroll for j in 0..<2 {
+						if m_vec[1-j] > 0 {
+							sel := simd.select(c_vecs[1-j], SCANNER_INDICES_256, SCANNER_SENTINEL_MAX_256)
+							off := simd.reduce_max(sel)
+							return i + (1-j) * SIMD_REG_SIZE_256 + int(off)
+						}
+					}
+				}
+			}
+		} else {
+			s_vecs: [4]simd.u8x16 = ---
+			c_vecs: [4]simd.u8x16 = ---
+			m_vecs: [4]u8 = ---
+
+			// Scan 64-byte chunks, using 128-bit SIMD.
+			for i >= 4 * SIMD_REG_SIZE_128 {
+				i -= 4 * SIMD_REG_SIZE_128
+
+				#unroll for j in 0..<4 {
+					s_vecs[j] = intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i+j*SIMD_REG_SIZE_128:]))
+					c_vecs[j] = simd.lanes_eq(s_vecs[j], c_vec)
+					m_vecs[j] = simd.reduce_or(c_vecs[j])
+				}
+				if m_vecs[0] | m_vecs[1] | m_vecs[2] | m_vecs[3] > 0 {
+					#unroll for j in 0..<4 {
+						if m_vecs[3-j] > 0 {
+							sel := simd.select(c_vecs[3-j], SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128)
+							off := simd.reduce_max(sel)
+							return i + (3-j) * SIMD_REG_SIZE_128 + int(off)
+						}
+					}
+				}
+			}
+		}
+	}
+
+	// Scan the remaining SIMD register sized chunks.
+	//
+	// Apparently LLVM does ok with 128-bit SWAR, so this path is also taken
+	// on potato targets.  Scanning more at a time when LLVM is emulating SIMD
+	// likely does not buy much, as all that does is increase GP register
+	// pressure.
+	for i >= SIMD_REG_SIZE_128 {
+		i -= SIMD_REG_SIZE_128
+
+		s0 := intrinsics.unaligned_load(cast(^simd.u8x16)raw_data(s[i:]))
+		c0 := simd.lanes_eq(s0, c_vec)
+		if simd.reduce_or(c0) > 0 { // at least one lane matched
+			sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL_MAX_128) // non-matching lanes become 0x00
+			off := simd.reduce_max(sel) // highest matching lane index
+			return i + int(off)
+		}
+	}
+
+	// Scan serially for the remainder.
+	for i > 0 {
+		i -= 1
 		if s[i] == c {
 			return i
 		}
 	}
+
 	return -1
 }


-
@private PRIME_RABIN_KARP :: 16777619

 index :: proc(s, substr: []byte) -> int {
@@ -1167,3 +1447,28 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc

 	return subslices[:]
 }
+
+// alias returns true iff a and b have a non-zero length, and any part of
+// a overlaps with b.
+alias :: proc "contextless" (a, b: []byte) -> bool {
+	a_len, b_len := len(a), len(b)
+	if a_len == 0 || b_len == 0 { // an empty slice is never reported as overlapping
+		return false
+	}
+
+	a_start, b_start := uintptr(raw_data(a)), uintptr(raw_data(b))
+	a_end, b_end := a_start + uintptr(a_len-1), b_start + uintptr(b_len-1) // inclusive: address of each slice's last byte
+
+	return a_start <= b_end && b_start <= a_end // the two closed address ranges intersect
+}
+
+// alias_inexactly returns true iff a and b have a non-zero length,
+// the base pointers of a and b are NOT equal, and any part of a overlaps
+// with b (i.e. `alias(a, b)`, with the exception that it returns false for
+// `a == b`, `b = a[:len(a)-69]` and similar same-base-pointer conditions).
+alias_inexactly :: proc "contextless" (a, b: []byte) -> bool {
+	if raw_data(a) == raw_data(b) { // same base pointer: deliberately not counted
+		return false
+	}
+	return alias(a, b)
+}
@@ -9,10 +9,11 @@ Reader :: struct {
 	prev_rune: int,    // previous reading index of rune or < 0
 }

-reader_init :: proc(r: ^Reader, s: []byte) {
+reader_init :: proc(r: ^Reader, s: []byte) -> io.Stream {
 	r.s = s
 	r.i = 0
 	r.prev_rune = -1
+	return reader_to_stream(r)
 }

 reader_to_stream :: proc(r: ^Reader) -> (s: io.Stream) {
@@ -33,6 +34,9 @@ reader_size :: proc(r: ^Reader) -> i64 {
 }

 reader_read :: proc(r: ^Reader, p: []byte) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	if r.i >= i64(len(r.s)) {
 		return 0, .EOF
 	}
@@ -42,6 +46,9 @@ reader_read :: proc(r: ^Reader, p: []byte) -> (n: int, err: io.Error) {
 	return
 }
 reader_read_at :: proc(r: ^Reader, p: []byte, off: i64) -> (n: int, err: io.Error) {
+	if len(p) == 0 {
+		return 0, nil
+	}
 	if off < 0 {
 		return 0, .Invalid_Offset
 	}
@@ -97,7 +104,6 @@ reader_unread_rune :: proc(r: ^Reader) -> io.Error {
 	return nil
 }
 reader_seek :: proc(r: ^Reader, offset: i64, whence: io.Seek_From) -> (i64, io.Error) {
-	r.prev_rune = -1
 	abs: i64
 	switch whence {
 	case .Start:
@@ -114,6 +120,7 @@ reader_seek :: proc(r: ^Reader, offset: i64, whence: io.Seek_From) -> (i64, io.E
 		return 0, .Invalid_Offset
 	}
 	r.i = abs
+	r.prev_rune = -1
 	return abs, nil
 }
 reader_write_to :: proc(r: ^Reader, w: io.Writer) -> (n: i64, err: io.Error) {
@@ -47,8 +47,8 @@ foreign libc {
 	clogf   :: proc(z: complex_float) -> complex_float ---

 	// 7.3.8 Power and absolute-value functions
-	cabs    :: proc(z: complex_double) -> complex_double ---
-	cabsf   :: proc(z: complex_float) -> complex_float ---
+	cabs    :: proc(z: complex_double) -> double ---
+	cabsf   :: proc(z: complex_float) -> float ---
 	cpow    :: proc(x, y: complex_double) -> complex_double ---
 	cpowf   :: proc(x, y: complex_float) -> complex_float ---
 	csqrt   :: proc(z: complex_double) -> complex_double ---
@@ -102,6 +102,6 @@ when ODIN_OS == .Haiku {
 // read the value, or to produce an lvalue such that you can assign a different
 // error value to errno. To work around this, just expose it as a function like
 // it actually is.
-errno :: #force_inline proc() -> ^int {
+errno :: #force_inline proc "contextless" () -> ^int {
 	return _get_errno()
 }
@@ -32,24 +32,21 @@ when ODIN_OS == .Windows {
 		// the RDX register will contain zero and correctly set the flag to disable
 		// stack unwinding.
 		@(link_name="_setjmp")
-		setjmp  :: proc(env: ^jmp_buf, hack: rawptr = nil) -> int ---
+		setjmp :: proc(env: ^jmp_buf, hack: rawptr = nil) -> int ---
 	}
 } else {
 	@(default_calling_convention="c")
 	foreign libc {
 		// 7.13.1 Save calling environment
-		//
-		// NOTE(dweiler): C11 requires setjmp be a macro, which means it won't
-		// necessarily export a symbol named setjmp but rather _setjmp in the case
-		// of musl, glibc, BSD libc, and msvcrt.
-		@(link_name="_setjmp")
-		setjmp  :: proc(env: ^jmp_buf) -> int ---
+		@(link_name=LSETJMP)
+		setjmp :: proc(env: ^jmp_buf) -> int ---
 	}
 }

@(default_calling_convention="c")
 foreign libc {
 	// 7.13.2 Restore calling environment
+	@(link_name=LLONGJMP)
 	longjmp :: proc(env: ^jmp_buf, val: int) -> ! ---
 }

@@ -64,3 +61,11 @@ foreign libc {
 // The choice of 4096 bytes for storage of this type is more than enough on all
 // relevant platforms.
 jmp_buf :: struct #align(16) { _: [4096]char, }
+
+when ODIN_OS == .NetBSD {
+	@(private) LSETJMP  :: "__setjmp14"
+	@(private) LLONGJMP :: "__longjmp14"
+} else {
+	@(private) LSETJMP  :: "setjmp"
+	@(private) LLONGJMP :: "longjmp"
+}
@@ -17,6 +17,12 @@ when ODIN_OS == .Windows {

 FILE :: struct {}

+Whence :: enum int {
+	SET = SEEK_SET,
+	CUR = SEEK_CUR,
+	END = SEEK_END,
+}
+
 // MSVCRT compatible.
 when ODIN_OS == .Windows {
 	_IOFBF       :: 0x0000
@@ -101,6 +107,8 @@ when ODIN_OS == .OpenBSD || ODIN_OS == .NetBSD {
 	SEEK_CUR :: 1
 	SEEK_END :: 2

+	TMP_MAX :: 308915776
+
 	foreign libc {
 		__sF: [3]FILE
 	}
@@ -128,6 +136,8 @@ when ODIN_OS == .FreeBSD {
 	SEEK_CUR :: 1
 	SEEK_END :: 2

+	TMP_MAX :: 308915776
+
 	foreign libc {
 		@(link_name="__stderrp") stderr: ^FILE
 		@(link_name="__stdinp")  stdin:  ^FILE
@@ -195,10 +205,21 @@ when ODIN_OS == .Haiku {
 	}
 }

+when ODIN_OS == .NetBSD {
+	@(private) LRENAME  :: "__posix_rename"
+	@(private) LFGETPOS :: "__fgetpos50"
+	@(private) LFSETPOS :: "__fsetpos50"
+} else {
+	@(private) LRENAME  :: "rename"
+	@(private) LFGETPOS :: "fgetpos"
+	@(private) LFSETPOS :: "fsetpos"
+}
+
@(default_calling_convention="c")
 foreign libc {
 	// 7.21.4 Operations on files
 	remove    :: proc(filename: cstring) -> int ---
+	@(link_name=LRENAME)
 	rename    :: proc(old, new: cstring) -> int ---
 	tmpfile   :: proc() -> ^FILE ---
 	tmpnam    :: proc(s: [^]char) -> [^]char ---
@@ -240,8 +261,10 @@ foreign libc {
 	fwrite    :: proc(ptr: rawptr, size: size_t, nmemb: size_t, stream: ^FILE) -> size_t ---

 	// 7.21.9 File positioning functions
+	@(link_name=LFGETPOS)
 	fgetpos   :: proc(stream: ^FILE, pos: ^fpos_t) -> int ---
-	fseek     :: proc(stream: ^FILE, offset: long, whence: int) -> int ---
+	fseek     :: proc(stream: ^FILE, offset: long, whence: Whence) -> int ---
+	@(link_name=LFSETPOS)
 	fsetpos   :: proc(stream: ^FILE, pos: ^fpos_t) -> int ---
 	ftell     :: proc(stream: ^FILE) -> long ---
 	rewind    :: proc(stream: ^FILE) ---
@@ -288,11 +311,11 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 				return 0, unknown_or_eof(file)
 			}

-			if fseek(file, long(offset), SEEK_SET) != 0 {
+			if fseek(file, long(offset), .SET) != 0 {
 				return 0, unknown_or_eof(file)
 			}

-			defer fseek(file, long(curr), SEEK_SET)
+			defer fseek(file, long(curr), .SET)

 			n = i64(fread(raw_data(p), size_of(byte), len(p), file))
 			if n == 0 { err = unknown_or_eof(file) }
@@ -307,17 +330,21 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 				return 0, unknown_or_eof(file)
 			}

-			if fseek(file, long(offset), SEEK_SET) != 0 {
+			if fseek(file, long(offset), .SET) != 0 {
 				return 0, unknown_or_eof(file)
 			}

-			defer fseek(file, long(curr), SEEK_SET)
+			defer fseek(file, long(curr), .SET)

 			n = i64(fwrite(raw_data(p), size_of(byte), len(p), file))
 			if n == 0 { err = unknown_or_eof(file) }

 		case .Seek:
-			if fseek(file, long(offset), int(whence)) != 0 {
+			#assert(int(Whence.SET) == int(io.Seek_From.Start))
+			#assert(int(Whence.CUR) == int(io.Seek_From.Current))
+			#assert(int(Whence.END) == int(io.Seek_From.End))
+
+			if fseek(file, long(offset), Whence(whence)) != 0 {
 				return 0, unknown_or_eof(file)
 			}
 		
@@ -326,9 +353,9 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 			if curr == -1 {
 				return 0, unknown_or_eof(file)
 			}
-			defer fseek(file, curr, SEEK_SET)
+			defer fseek(file, curr, .SET)

-			if fseek(file, 0, SEEK_END) != 0 {
+			if fseek(file, 0, .END) != 0 {
 				return 0, unknown_or_eof(file)
 			}

@@ -341,7 +368,7 @@ to_stream :: proc(file: ^FILE) -> io.Stream {
 			return 0, .Empty
 		
 		case .Query:
-			return io.query_utility({ .Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size })
+			return io.query_utility({ .Close, .Flush, .Read, .Read_At, .Write, .Write_At, .Seek, .Size, .Query })
 		}
 		return
 	}
@@ -40,10 +40,9 @@ when ODIN_OS == .Linux {
 }


-when ODIN_OS == .Darwin {
+when ODIN_OS == .Darwin || ODIN_OS == .FreeBSD || ODIN_OS == .OpenBSD {
 	RAND_MAX :: 0x7fffffff

-	// GLIBC and MUSL only
 	@(private="file")
 	@(default_calling_convention="c")
 	foreign libc {
@@ -55,6 +54,20 @@ when ODIN_OS == .Darwin {
 	}
 }

+when ODIN_OS == .NetBSD {
+	RAND_MAX :: 0x7fffffff
+
+	@(private="file")
+	@(default_calling_convention="c")
+	foreign libc {
+		__mb_cur_max: size_t
+	}
+
+	MB_CUR_MAX :: #force_inline proc() -> size_t {
+		return __mb_cur_max
+	}
+}
+
 // C does not declare what these values should be, as an implementation is free
 // to use any two distinct values it wants to indicate success or failure.
 // However, nobody actually does and everyone appears to have agreed upon these
@@ -99,7 +112,7 @@ foreign libc {
 	at_quick_exit :: proc(func: proc "c" ()) -> int ---
 	exit          :: proc(status: int) -> ! ---
 	_Exit         :: proc(status: int) -> ! ---
-	getenv        :: proc(name: cstring) -> [^]char ---
+	getenv        :: proc(name: cstring) -> cstring ---
 	quick_exit    :: proc(status: int) -> ! ---
 	system        :: proc(cmd: cstring) -> int ---

@@ -150,4 +163,4 @@ aligned_free :: #force_inline proc "c" (ptr: rawptr) {
 	} else {
 		free(ptr)
 	}
-}
+}
@@ -40,7 +40,7 @@ foreign libc {
 	strtok   :: proc(s1: [^]char, s2: cstring) -> [^]char ---

 	// 7.24.6 Miscellaneous functions
-	strerror :: proc(errnum: int) -> [^]char ---
+	strerror :: proc(errnum: int) -> cstring ---
 	strlen   :: proc(s: cstring) -> size_t ---
 }
 memset :: proc "c" (s: rawptr, c: int, n: size_t) -> rawptr {
@@ -50,30 +50,56 @@ when ODIN_OS == .Linux || ODIN_OS == .FreeBSD || ODIN_OS == .Darwin || ODIN_OS =
 	foreign libc {
 		// 7.27.2 Time manipulation functions
 		clock        :: proc() -> clock_t ---
+		@(link_name=LDIFFTIME)
 		difftime     :: proc(time1, time2: time_t) -> double ---
+		@(link_name=LMKTIME)
 		mktime       :: proc(timeptr: ^tm) -> time_t ---
+		@(link_name=LTIME)
 		time         :: proc(timer: ^time_t) -> time_t ---
 		timespec_get :: proc(ts: ^timespec, base: int) -> int ---

 		// 7.27.3 Time conversion functions
 		asctime      :: proc(timeptr: ^tm) -> [^]char ---
+		@(link_name=LCTIME)
 		ctime        :: proc(timer: ^time_t) -> [^]char ---
+		@(link_name=LGMTIME)
 		gmtime       :: proc(timer: ^time_t) -> ^tm ---
+		@(link_name=LLOCALTIME)
 		localtime    :: proc(timer: ^time_t) -> ^tm ---
 		strftime     :: proc(s: [^]char, maxsize: size_t, format: cstring, timeptr: ^tm) -> size_t ---
 	}

+	when ODIN_OS == .NetBSD {
+		@(private) LDIFFTIME  :: "__difftime50"
+		@(private) LMKTIME    :: "__mktime50"
+		@(private) LTIME      :: "__time50"
+		@(private) LCTIME     :: "__ctime50"
+		@(private) LGMTIME    :: "__gmtime50"
+		@(private) LLOCALTIME :: "__localtime50"
+	} else {
+		@(private) LDIFFTIME  :: "difftime"
+		@(private) LMKTIME    :: "mktime"
+		@(private) LTIME      :: "time"
+		@(private) LCTIME     :: "ctime"
+		@(private) LGMTIME    :: "gmtime"
+		@(private) LLOCALTIME :: "localtime"
+	}
+
 	when ODIN_OS == .OpenBSD {
 		CLOCKS_PER_SEC :: 100
 	} else {
 		CLOCKS_PER_SEC :: 1000000
 	}

-	TIME_UTC       :: 1
+	TIME_UTC :: 1

-	time_t         :: distinct i64
+	time_t :: distinct i64

-	clock_t        :: long
+	when ODIN_OS == .FreeBSD || ODIN_OS == .NetBSD {
+		clock_t :: distinct int32_t
+	} else {
+		clock_t :: distinct long
+	}

 	timespec :: struct {
 		tv_sec:  time_t,
@@ -186,7 +186,7 @@ input_size_from_stream :: proc(z: ^Context_Stream_Input) -> (res: i64, err: Erro

 input_size :: proc{input_size_from_memory, input_size_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_slice_from_memory :: #force_inline proc(z: ^Context_Memory_Input, size: int) -> (res: []u8, err: io.Error) {
 	#no_bounds_check {
 		if len(z.input_data) >= size {
@@ -203,7 +203,7 @@ read_slice_from_memory :: #force_inline proc(z: ^Context_Memory_Input, size: int
 	}
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_slice_from_stream :: #force_inline proc(z: ^Context_Stream_Input, size: int) -> (res: []u8, err: io.Error) {
 	// TODO: REMOVE ALL USE OF context.temp_allocator here
 	// there is literally no need for it
@@ -214,13 +214,13 @@ read_slice_from_stream :: #force_inline proc(z: ^Context_Stream_Input, size: int

 read_slice :: proc{read_slice_from_memory, read_slice_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_data :: #force_inline proc(z: ^$C, $T: typeid) -> (res: T, err: io.Error) {
 	b := read_slice(z, size_of(T)) or_return
 	return (^T)(&b[0])^, nil
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_u8_from_memory :: #force_inline proc(z: ^Context_Memory_Input) -> (res: u8, err: io.Error) {
 	#no_bounds_check {
 		if len(z.input_data) >= 1 {
@@ -232,7 +232,7 @@ read_u8_from_memory :: #force_inline proc(z: ^Context_Memory_Input) -> (res: u8,
 	return 0, .EOF
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_u8_from_stream :: #force_inline proc(z: ^Context_Stream_Input) -> (res: u8, err: io.Error) {
 	b := read_slice_from_stream(z, 1) or_return
 	return b[0], nil
@@ -242,7 +242,7 @@ read_u8 :: proc{read_u8_from_memory, read_u8_from_stream}

 // You would typically only use this at the end of Inflate, to drain bits from the code buffer
 // preferentially.
-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_u8_prefer_code_buffer_lsb :: #force_inline proc(z: ^$C) -> (res: u8, err: io.Error) {
 	if z.num_bits >= 8 {
 		res = u8(read_bits_no_refill_lsb(z, 8))
@@ -257,7 +257,7 @@ read_u8_prefer_code_buffer_lsb :: #force_inline proc(z: ^$C) -> (res: u8, err: i
 	return
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_data_from_memory :: #force_inline proc(z: ^Context_Memory_Input, $T: typeid) -> (res: T, err: io.Error) {
 	size :: size_of(T)

@@ -275,7 +275,7 @@ peek_data_from_memory :: #force_inline proc(z: ^Context_Memory_Input, $T: typeid
 	}
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_data_at_offset_from_memory :: #force_inline proc(z: ^Context_Memory_Input, $T: typeid, #any_int offset: int) -> (res: T, err: io.Error) {
 	size :: size_of(T)

@@ -293,7 +293,7 @@ peek_data_at_offset_from_memory :: #force_inline proc(z: ^Context_Memory_Input,
 	}
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_data_from_stream :: #force_inline proc(z: ^Context_Stream_Input, $T: typeid) -> (res: T, err: io.Error) {
 	size :: size_of(T)

@@ -317,7 +317,7 @@ peek_data_from_stream :: #force_inline proc(z: ^Context_Stream_Input, $T: typeid
 	return res, .None
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_data_at_offset_from_stream :: #force_inline proc(z: ^Context_Stream_Input, $T: typeid, #any_int offset: int) -> (res: T, err: io.Error) {
 	size :: size_of(T)

@@ -352,14 +352,14 @@ peek_data :: proc{peek_data_from_memory, peek_data_from_stream, peek_data_at_off


 // Sliding window read back
-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_back_byte :: #force_inline proc(z: ^$C, offset: i64) -> (res: u8, err: io.Error) {
 	// Look back into the sliding window.
 	return z.output.buf[z.bytes_written - offset], .None
 }

 // Generalized bit reader LSB
-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width := i8(48)) {
 	refill := u64(width)
 	b      := u64(0)
@@ -385,7 +385,7 @@ refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width :=
 }

 // Generalized bit reader LSB
-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 refill_lsb_from_stream :: proc(z: ^Context_Stream_Input, width := i8(24)) {
 	refill := u64(width)

@@ -414,13 +414,13 @@ refill_lsb_from_stream :: proc(z: ^Context_Stream_Input, width := i8(24)) {
 refill_lsb :: proc{refill_lsb_from_memory, refill_lsb_from_stream}


-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 consume_bits_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width: u8) {
 	z.code_buffer >>= width
 	z.num_bits -= u64(width)
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 consume_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width: u8) {
 	z.code_buffer >>= width
 	z.num_bits -= u64(width)
@@ -428,7 +428,7 @@ consume_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, wid

 consume_bits_lsb :: proc{consume_bits_lsb_from_memory, consume_bits_lsb_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_bits_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width: u8) -> u32 {
 	if z.num_bits < u64(width) {
 		refill_lsb(z)
@@ -436,7 +436,7 @@ peek_bits_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width:
 	return u32(z.code_buffer &~ (~u64(0) << width))
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width: u8) -> u32 {
 	if z.num_bits < u64(width) {
 		refill_lsb(z)
@@ -446,13 +446,13 @@ peek_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width:

 peek_bits_lsb :: proc{peek_bits_lsb_from_memory, peek_bits_lsb_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_bits_no_refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width: u8) -> u32 {
 	assert(z.num_bits >= u64(width))
 	return u32(z.code_buffer &~ (~u64(0) << width))
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 peek_bits_no_refill_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width: u8) -> u32 {
 	assert(z.num_bits >= u64(width))
 	return u32(z.code_buffer &~ (~u64(0) << width))
@@ -460,14 +460,14 @@ peek_bits_no_refill_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Inp

 peek_bits_no_refill_lsb :: proc{peek_bits_no_refill_lsb_from_memory, peek_bits_no_refill_lsb_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_bits_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width: u8) -> u32 {
 	k := #force_inline peek_bits_lsb(z, width)
 	#force_inline consume_bits_lsb(z, width)
 	return k
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width: u8) -> u32 {
 	k := peek_bits_lsb(z, width)
 	consume_bits_lsb(z, width)
@@ -476,14 +476,14 @@ read_bits_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width:

 read_bits_lsb :: proc{read_bits_lsb_from_memory, read_bits_lsb_from_stream}

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_bits_no_refill_lsb_from_memory :: #force_inline proc(z: ^Context_Memory_Input, width: u8) -> u32 {
 	k := #force_inline peek_bits_no_refill_lsb(z, width)
 	#force_inline consume_bits_lsb(z, width)
 	return k
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 read_bits_no_refill_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Input, width: u8) -> u32 {
 	k := peek_bits_no_refill_lsb(z, width)
 	consume_bits_lsb(z, width)
@@ -493,14 +493,14 @@ read_bits_no_refill_lsb_from_stream :: #force_inline proc(z: ^Context_Stream_Inp
 read_bits_no_refill_lsb :: proc{read_bits_no_refill_lsb_from_memory, read_bits_no_refill_lsb_from_stream}


-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 discard_to_next_byte_lsb_from_memory :: proc(z: ^Context_Memory_Input) {
 	discard := u8(z.num_bits & 7)
 	#force_inline consume_bits_lsb(z, discard)
 }


-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 discard_to_next_byte_lsb_from_stream :: proc(z: ^Context_Stream_Input) {
 	discard := u8(z.num_bits & 7)
 	consume_bits_lsb(z, discard)
@@ -12,6 +12,7 @@ package compress_zlib

 import "core:compress"

+import "base:intrinsics"
 import "core:mem"
 import "core:io"
 import "core:hash"
@@ -120,23 +121,17 @@ Huffman_Table :: struct {
 }

 // Implementation starts here
-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 z_bit_reverse :: #force_inline proc(n: u16, bits: u8) -> (r: u16) {
 	assert(bits <= 16)
-	// NOTE: Can optimize with llvm.bitreverse.i64 or some bit twiddling
-	// by reversing all of the bits and masking out the unneeded ones.
-	r = n
-	r = ((r & 0xAAAA) >>  1) | ((r & 0x5555) << 1)
-	r = ((r & 0xCCCC) >>  2) | ((r & 0x3333) << 2)
-	r = ((r & 0xF0F0) >>  4) | ((r & 0x0F0F) << 4)
-	r = ((r & 0xFF00) >>  8) | ((r & 0x00FF) << 8)
+	r = intrinsics.reverse_bits(n)

 	r >>= (16 - bits)
 	return
 }


-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 grow_buffer :: proc(buf: ^[dynamic]u8) -> (err: compress.Error) {
 	/*
 		That we get here at all means that we didn't pass an expected output size,
@@ -154,7 +149,7 @@ grow_buffer :: proc(buf: ^[dynamic]u8) -> (err: compress.Error) {
 	TODO: Make these return compress.Error.
 */

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 write_byte :: #force_inline proc(z: ^$C, c: u8) -> (err: io.Error) #no_bounds_check {
 	/*
 		Resize if needed.
@@ -173,7 +168,7 @@ write_byte :: #force_inline proc(z: ^$C, c: u8) -> (err: io.Error) #no_bounds_ch
 	return .None
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 repl_byte :: proc(z: ^$C, count: u16, c: u8) -> (err: io.Error) #no_bounds_check {
 	/*
 		TODO(Jeroen): Once we have a magic ring buffer, we can just peek/write into it
@@ -201,7 +196,7 @@ repl_byte :: proc(z: ^$C, count: u16, c: u8) -> (err: io.Error) #no_bounds_check
 	return .None
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 repl_bytes :: proc(z: ^$C, count: u16, distance: u16) -> (err: io.Error) {
 	/*
 		TODO(Jeroen): Once we have a magic ring buffer, we can just peek/write into it
@@ -234,8 +229,8 @@ allocate_huffman_table :: proc(allocator := context.allocator) -> (z: ^Huffman_T
 	return new(Huffman_Table, allocator), nil
 }

-@(optimization_mode="speed")
-build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
+@(optimization_mode="favor_size")
+build_huffman :: #force_no_inline proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
 	sizes:     [HUFFMAN_MAX_BITS+1]int
 	next_code: [HUFFMAN_MAX_BITS+1]int

@@ -293,7 +288,7 @@ build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
 	return nil
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 decode_huffman_slowpath :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
 	code := u16(compress.peek_bits_lsb(z,16))

@@ -324,7 +319,7 @@ decode_huffman_slowpath :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Erro
 	return r, nil
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 decode_huffman :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
 	if z.num_bits < 16 {
 		if z.num_bits > 63 {
@@ -344,7 +339,7 @@ decode_huffman :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bo
 	return decode_huffman_slowpath(z, t)
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 parse_huffman_block :: proc(z: ^$C, z_repeat, z_offset: ^Huffman_Table) -> (err: Error) #no_bounds_check {
 	#no_bounds_check for {
 		value, e := decode_huffman(z, z_repeat)
@@ -413,7 +408,7 @@ parse_huffman_block :: proc(z: ^$C, z_repeat, z_offset: ^Huffman_Table) -> (err:
 	}
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 inflate_from_context :: proc(using ctx: ^compress.Context_Memory_Input, raw := false, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
 	/*
 		ctx.output must be a bytes.Buffer for now. We'll add a separate implementation that writes to a stream.
@@ -486,7 +481,7 @@ inflate_from_context :: proc(using ctx: ^compress.Context_Memory_Input, raw := f

 // TODO: Check alignment of reserve/resize.

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 inflate_raw :: proc(z: ^$C, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
 	context.allocator = allocator
 	expected_output_size := expected_output_size
@@ -670,4 +665,4 @@ inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, raw := fals
 	return inflate_raw(&ctx, expected_output_size=expected_output_size)
 }

-inflate :: proc{inflate_from_context, inflate_from_byte_array}
+inflate :: proc{inflate_from_context, inflate_from_byte_array}
@@ -0,0 +1,46 @@
+/*
+Package list implements an intrusive doubly-linked list.
+
+An intrusive container requires a `Node` to be embedded in your own structure, like this:
+
+	My_String :: struct {
+		node:  list.Node,
+		value: string,
+	}
+
+Embedding the members of a `list.Node` in your structure with the `using` keyword is also allowed:
+
+	My_String :: struct {
+		using node: list.Node,
+		value: string,
+	}
+
+Here is a full example:
+
+	package test
+	
+	import "core:fmt"
+	import "core:container/intrusive/list"
+	
+	main :: proc() {
+	    l: list.List
+	
+	    one := My_String{value="Hello"}
+	    two := My_String{value="World"}
+	
+	    list.push_back(&l, &one.node)
+	    list.push_back(&l, &two.node)
+	
+	    iter := list.iterator_head(l, My_String, "node")
+	    for s in list.iterate_next(&iter) {
+	        fmt.println(s.value)
+	    }
+	}
+	
+	My_String :: struct {
+	    node:  list.Node,
+	    value: string,
+	}
+
+*/
+package container_intrusive_list
@@ -18,11 +18,18 @@ List :: struct {
 	tail: ^Node,
 }

-
+// The list link you must include in your own structure.
 Node :: struct {
 	prev, next: ^Node,
 }

+/*
+Inserts a new element at the front of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure
+*/
 push_front :: proc "contextless" (list: ^List, node: ^Node) {
 	if list.head != nil {
 		list.head.prev = node
@@ -33,7 +40,13 @@ push_front :: proc "contextless" (list: ^List, node: ^Node) {
 		node.prev, node.next = nil, nil
 	}
 }
+/*
+Inserts a new element at the back of the list with O(1) time complexity.

+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure
+*/
 push_back :: proc "contextless" (list: ^List, node: ^Node) {
 	if list.tail != nil {
 		list.tail.next = node
@@ -45,6 +58,13 @@ push_back :: proc "contextless" (list: ^List, node: ^Node) {
 	}
 }

+/*
+Removes an element from a list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure to be removed
+*/
 remove :: proc "contextless" (list: ^List, node: ^Node) {
 	if node != nil {
 		if node.next != nil {
@@ -61,7 +81,13 @@ remove :: proc "contextless" (list: ^List, node: ^Node) {
 		}
 	}
 }
+/*
+Removes from the given list all elements that satisfy a condition with O(N) time complexity.

+**Inputs**
+- list: The container list
+- to_erase: The condition procedure. It should return `true` if a node should be removed, `false` otherwise
+*/
 remove_by_proc :: proc(list: ^List, to_erase: proc(^Node) -> bool) {
 	for node := list.head; node != nil; {
 		next := node.next
@@ -82,7 +108,13 @@ remove_by_proc :: proc(list: ^List, to_erase: proc(^Node) -> bool) {
 		node = next
 	}
 }
+/*
+Removes from the given list all elements that satisfy a condition with O(N) time complexity.

+**Inputs**
+- list: The container list
+- to_erase: The _contextless_ condition procedure. It should return `true` if a node should be removed, `false` otherwise
+*/
 remove_by_proc_contextless :: proc(list: ^List, to_erase: proc "contextless" (^Node) -> bool) {
 	for node := list.head; node != nil; {
 		next := node.next
@@ -104,12 +136,26 @@ remove_by_proc_contextless :: proc(list: ^List, to_erase: proc "contextless" (^N
 	}
 }

+/*
+Checks whether the given list does not contain any element.

+**Inputs**
+- list: The container list

+**Returns** `true` if `list` is empty, `false` otherwise
+*/
 is_empty :: proc "contextless" (list: ^List) -> bool {
 	return list.head == nil
 }

+/*
+Removes and returns the element at the front of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+
+**Returns** The node member of the user-defined element structure, or `nil` if the list is empty
+*/
 pop_front :: proc "contextless" (list: ^List) -> ^Node {
 	link := list.head
 	if link == nil {
@@ -130,6 +176,14 @@ pop_front :: proc "contextless" (list: ^List) -> ^Node {
 	return link

 }
+/*
+Removes and returns the element at the back of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+
+**Returns** The node member of the user-defined element structure, or `nil` if the list is empty
+*/
 pop_back :: proc "contextless" (list: ^List) -> ^Node {
 	link := list.tail
 	if link == nil {
@@ -151,29 +205,102 @@ pop_back :: proc "contextless" (list: ^List) -> ^Node {
 }


+
 Iterator :: struct($T: typeid) {
 	curr:   ^Node,
 	offset: uintptr,
 }

+/*
+Creates an iterator pointing at the head of the given list. For an example, see `iterate_next`.
+
+**Inputs**
+- list: The container list
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at the head of `list`
+
+*/
 iterator_head :: proc "contextless" (list: List, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {list.head, offset_of_by_string(T, field_name)}
 }
+/*
+Creates an iterator pointing at the tail of the given list. For an example, see `iterate_prev`.

+**Inputs**
+- list: The container list
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at the tail of `list`
+
+*/
 iterator_tail :: proc "contextless" (list: List, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {list.tail, offset_of_by_string(T, field_name)}
 }
+/*
+Creates an iterator pointing at the specified node of a list.

+**Inputs**
+- node: a list node
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at `node`
+
+*/
 iterator_from_node :: proc "contextless" (node: ^Node, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {node, offset_of_by_string(T, field_name)}
 }

+/*
+Retrieves the next element in a list and advances the iterator.
+
+**Inputs**  
+- it: The iterator
+
+**Returns**
+- ptr: The next list element
+- ok: `true` if the element is valid (the iterator could advance), `false` otherwise
+
+Example:
+
+	import "core:fmt"
+	import "core:container/intrusive/list"
+
+	iterate_next_example :: proc() {
+		l: list.List
+
+		one := My_Struct{value=1}
+		two := My_Struct{value=2}
+
+		list.push_back(&l, &one.node)
+		list.push_back(&l, &two.node)
+
+		it := list.iterator_head(l, My_Struct, "node")
+		for num in list.iterate_next(&it) {
+			fmt.println(num.value)
+		}
+	}
+
+	My_Struct :: struct {
+		node : list.Node,
+		value: int,
+	}
+
+Output:
+
+	1
+	2
+
+*/
 iterate_next :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	node := it.curr
 	if node == nil {
@@ -183,7 +310,47 @@ iterate_next :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {

 	return (^T)(uintptr(node) - it.offset), true
 }
+/*
+Retrieves the previous element in a list and recedes the iterator.

+**Inputs**  
+- it: The iterator
+
+**Returns**
+- ptr: The previous list element
+- ok: `true` if the element is valid (the iterator could recede), `false` otherwise
+
+Example:
+
+	import "core:fmt"
+	import "core:container/intrusive/list"
+
+	iterate_prev_example :: proc() {
+		l: list.List
+
+		one := My_Struct{value=1}
+		two := My_Struct{value=2}
+
+		list.push_back(&l, &one.node)
+		list.push_back(&l, &two.node)
+
+		it := list.iterator_tail(l, My_Struct, "node")
+		for num in list.iterate_prev(&it) {
+			fmt.println(num.value)
+		}
+	}
+
+	My_Struct :: struct {
+		node : list.Node,
+		value: int,
+	}
+
+Output:
+
+	2
+	1
+
+*/
 iterate_prev :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	node := it.curr
 	if node == nil {
@@ -192,4 +359,4 @@ iterate_prev :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	it.curr = node.prev

 	return (^T)(uintptr(node) - it.offset), true
-}
+}
@@ -95,11 +95,11 @@ front_ptr :: proc(q: ^$Q/Queue($T)) -> ^T {
 }

 back :: proc(q: ^$Q/Queue($T)) -> T {
-	idx := (q.offset+uint(q.len))%builtin.len(q.data)
+	idx := (q.offset+uint(q.len - 1))%builtin.len(q.data)
 	return q.data[idx]
 }
 back_ptr :: proc(q: ^$Q/Queue($T)) -> ^T {
-	idx := (q.offset+uint(q.len))%builtin.len(q.data)
+	idx := (q.offset+uint(q.len - 1))%builtin.len(q.data)
 	return &q.data[idx]
 }

@@ -7,9 +7,8 @@ STRIDE :: 4

 // Context is a keyed AES (ECB) instance.
 Context :: struct {
-	_sk_exp:         [120]u64,
-	_num_rounds:     int,
-	_is_initialized: bool,
+	_sk_exp:     [120]u64,
+	_num_rounds: int,
 }

 // init initializes a context for AES with the provided key.
@@ -18,13 +17,10 @@ init :: proc(ctx: ^Context, key: []byte) {

 	ctx._num_rounds = keysched(skey[:], key)
 	skey_expand(ctx._sk_exp[:], skey[:], ctx._num_rounds)
-	ctx._is_initialized = true
 }

 // encrypt_block sets `dst` to `AES-ECB-Encrypt(src)`.
 encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64
 	load_blockx1(&q, src)
 	_encrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
@@ -33,8 +29,6 @@ encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {

 // decrypt_block sets `dst` to `AES-ECB-Decrypt(src)`.
 decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64
 	load_blockx1(&q, src)
 	_decrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
@@ -43,8 +37,6 @@ decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {

 // encrypt_blocks sets `dst` to `AES-ECB-Encrypt(src[0], .. src[n])`.
 encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64 = ---
 	src, dst := src, dst

@@ -67,8 +59,6 @@ encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {

 // decrypt_blocks sets dst to `AES-ECB-Decrypt(src[0], .. src[n])`.
 decrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64 = ---
 	src, dst := src, dst

@@ -80,8 +80,8 @@ ghash :: proc "contextless" (dst, key, data: []byte) {
 	h2 := h0 ~ h1
 	h2r := h0r ~ h1r

-	src: []byte
 	for l > 0 {
+		src: []byte = ---
 		if l >= _aes.GHASH_BLOCK_SIZE {
 			src = buf
 			buf = buf[_aes.GHASH_BLOCK_SIZE:]
@@ -0,0 +1,43 @@
+//+build amd64
+package aes_hw_intel
+
+import "core:sys/info"
+
+// is_supported returns true iff hardware accelerated AES
+// is supported.
+is_supported :: proc "contextless" () -> bool {
+	features, ok := info.cpu_features.?
+	if !ok {
+		return false
+	}
+
+	// Note: Everything with AES-NI and PCLMULQDQ has support for
+	// the required SSE extensions.
+	req_features :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.aes,
+		.pclmulqdq,
+	}
+	return features >= req_features
+}
+
+// Context is a keyed AES (ECB) instance.
+Context :: struct {
+	// Note: The ideal thing to do is for the expanded round keys to be
+	// arrays of `__m128i`, however that implies alignment (or using AVX).
+	//
+	// All the people using e-waste processors that don't support an
+	// instruction set that has been around for over 10 years are why
+	// we can't have nice things.
+	_sk_exp_enc: [15][16]byte,
+	_sk_exp_dec: [15][16]byte,
+	_num_rounds: int,
+}
+
+// init initializes a context for AES with the provided key.
+init :: proc(ctx: ^Context, key: []byte) {
+	keysched(ctx, key)
+}
+
@@ -0,0 +1,277 @@
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd/x86"
+
+@(private = "file")
+GHASH_STRIDE_HW :: 4
+@(private = "file")
+GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
+
+// GHASH is defined over elements of GF(2^128) with "full little-endian"
+// representation: leftmost byte is least significant, and, within each
+// byte, leftmost _bit_ is least significant. The natural ordering in
+// x86 is "mixed little-endian": bytes are ordered from least to most
+// significant, but bits within a byte are in most-to-least significant
+// order. Going to full little-endian representation would require
+// reversing bits within each byte, which is doable but expensive.
+//
+// Instead, we go to full big-endian representation, by swapping bytes
+// around, which is done with a single _mm_shuffle_epi8() opcode (it
+// comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
+// can use a full big-endian representation because in a carryless
+// multiplication, we have a nice bit reversal property:
+//
+// rev_128(x) * rev_128(y) = rev_255(x * y)
+//
+// So by using full big-endian, we still get the right result, except
+// that it is right-shifted by 1 bit. The left-shift is relatively
+// inexpensive, and it can be mutualised.
+//
+// Since SSE2 opcodes do not have facilities for shifting full 128-bit
+// values with bit precision, we have to break down values into 64-bit
+// chunks. We number chunks from 0 to 3 in left to right order.
+
+@(private = "file")
+_BYTESWAP_INDEX: x86.__m128i : { 0x08090a0b0c0d0e0f, 0x0001020304050607 }
+
+@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
+byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
+	return x86._mm_shuffle_epi8(x, _BYTESWAP_INDEX)
+}
+
+// From a 128-bit value kw, compute kx as the XOR of the two 64-bit
+// halves of kw (into the right half of kx; left half is unspecified),
+// and return kx.
+@(private = "file", require_results, enable_target_feature = "sse2")
+bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i {
+	return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e))
+}
+
+// Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
+// the XOR of the two values (kx), and return (kw, kx).
+@(private = "file", enable_target_feature = "sse2")
+pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	kw := x86._mm_unpacklo_epi64(k1, k0)
+	kx := x86._mm_xor_si128(k0, k1)
+	return kw, kx
+}
+
+// Left-shift by 1 bit a 256-bit value (in four 64-bit words).
+@(private = "file", require_results, enable_target_feature = "sse2")
+sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) {
+	x0, x1, x2, x3 := x0, x1, x2, x3
+
+	x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63))
+	x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63))
+	x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63))
+	x3 = x86._mm_slli_epi64(x3, 1)
+
+	return x0, x1, x2, x3
+}
+
+// Perform reduction in GF(2^128).
+@(private = "file", require_results, enable_target_feature = "sse2")
+reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	x0, x1, x2 := x0, x1, x2
+
+	x1 = x86._mm_xor_si128(
+		x1,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x3,
+				x86._mm_srli_epi64(x3, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x3, 2),
+				x86._mm_srli_epi64(x3, 7))))
+	x2 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x2,
+			x86._mm_slli_epi64(x3, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x3, 62),
+			x86._mm_slli_epi64(x3, 57)))
+	x0 = x86._mm_xor_si128(
+		x0,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x2,
+				x86._mm_srli_epi64(x2, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x2, 2),
+				x86._mm_srli_epi64(x2, 7))))
+	x1 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x1,
+			x86._mm_slli_epi64(x2, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x2, 62),
+			x86._mm_slli_epi64(x2, 57)))
+
+	return x0, x1
+}
+
+// Square value kw in GF(2^128) into (dw,dx).
+@(private = "file", require_results, enable_target_feature = "sse2,pclmul")
+square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11)
+	z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00)
+	z0 := x86._mm_shuffle_epi32(z1, 0x0E)
+	z2 := x86._mm_shuffle_epi32(z3, 0x0E)
+	z0, z1, z2, z3 = sl_256(z0, z1, z2, z3)
+	z0, z1 = reduce_f128(z0, z1, z2, z3)
+	return pbk(z0, z1)
+}
+
+// ghash calculates the GHASH of data, with the key `key`, and input `dst`
+// and `data`, and stores the resulting digest in `dst`.
+//
+// Note: `dst` is both an input and an output, to support easy implementation
+// of GCM.
+@(enable_target_feature = "sse2,ssse3,pclmul")
+ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
+	if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
+		intrinsics.trap()
+	}
+
+	// Note: BearSSL opts to copy the remainder into a zero-filled
+	// 64-byte buffer.  We do something slightly more simple.
+
+	// Load key and dst (h and y).
+	yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst)))
+	h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+	yw = byteswap(yw)
+	h1w = byteswap(h1w)
+	h1x := bk(h1w)
+
+	// Process 4 blocks at a time
+	buf := data
+	l := len(buf)
+	if l >= GHASH_STRIDE_BYTES_HW {
+		// Compute h2 = h^2
+		h2w, h2x := square_f128(h1w)
+
+		// Compute h3 = h^3 = h*(h^2)
+		t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00)
+		t2 := x86._mm_xor_si128(
+			x86._mm_clmulepi64_si128(h1x, h2x, 0x00),
+			x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		h3w, h3x := pbk(t0, t1)
+
+		// Compute h4 = h^4 = (h^2)^2
+		h4w, h4x := square_f128(h2w)
+
+		for l >= GHASH_STRIDE_BYTES_HW {
+			aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf)))
+			aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:])))
+			aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:])))
+			aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:])))
+			aw0 = byteswap(aw0)
+			aw1 = byteswap(aw1)
+			aw2 = byteswap(aw2)
+			aw3 = byteswap(aw3)
+			buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW
+
+			aw0 = x86._mm_xor_si128(aw0, yw)
+			ax1 := bk(aw1)
+			ax2 := bk(aw2)
+			ax3 := bk(aw3)
+			ax0 := bk(aw0)
+
+			t1 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x11),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x11)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x11),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x11)))
+			t3 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x00),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x00),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x00)))
+			t2 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax0, h4x, 0x00),
+					x86._mm_clmulepi64_si128(ax1, h3x, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax2, h2x, 0x00),
+					x86._mm_clmulepi64_si128(ax3, h1x, 0x00)))
+			t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+			t0 = x86._mm_shuffle_epi32(t1, 0x0E)
+			t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+			t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+			t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+			t0, t1 = reduce_f128(t0, t1, t2, t3)
+			yw = x86._mm_unpacklo_epi64(t1, t0)
+		}
+	}
+
+	// Process 1 block at a time
+	for l > 0 {
+		src: []byte = ---
+		if l >= _aes.GHASH_BLOCK_SIZE {
+			src = buf
+			buf = buf[_aes.GHASH_BLOCK_SIZE:]
+			l -= _aes.GHASH_BLOCK_SIZE
+		} else {
+			tmp: [_aes.GHASH_BLOCK_SIZE]byte
+			copy(tmp[:], buf)
+			src = tmp[:]
+			l = 0
+		}
+
+		aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+		aw = byteswap(aw)
+
+		aw = x86._mm_xor_si128(aw, yw)
+		ax := bk(aw)
+
+		t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00)
+		t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00)
+		t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		yw = x86._mm_unpacklo_epi64(t1, t0)
+	}
+
+	// Write back the hash (dst, aka y)
+	yw = byteswap(yw)
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw)
+}
@@ -0,0 +1,178 @@
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:mem"
+import "core:simd/x86"
+
+// Intel AES-NI based implementation.  Inspiration taken from BearSSL.
+//
+// Note: This assumes that the SROA optimization pass is enabled to be
+// anything resembling performant; otherwise, LLVM will not elide a massive
+// number of redundant loads/stores it generates for every intrinsic call.
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xff)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
+@(private = "file", require_results, enable_target_feature = "sse,sse2")
+expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	tmp := k2
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
+	r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
+
+	return r1, r2
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	return k1
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xaa)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
+@(private = "file", enable_target_feature = "aes")
+derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
+	for i in 1 ..< num_rounds {
+		tmp := x86._mm_aesimc_si128(sks[i])
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
+	}
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
+}
+
+@(private, enable_target_feature = "sse,sse2,aes")
+keysched :: proc(ctx: ^Context, key: []byte) {
+	sks: [15]x86.__m128i = ---
+
+	// Compute the encryption keys.
+	num_rounds, key_len := 0, len(key)
+	switch key_len {
+	case _aes.KEY_SIZE_128:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
+		sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
+		sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
+		sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
+		sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
+		sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
+		sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
+		sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
+		sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
+		sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
+		num_rounds = _aes.ROUNDS_128
+	case _aes.KEY_SIZE_192:
+		k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		k1 := x86.__m128i{
+			intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
+			0,
+		}
+		sks[0] = k0
+		sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
+		sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
+		sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
+		sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
+		sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
+		sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
+		sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
+		sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
+		num_rounds = _aes.ROUNDS_192
+	case _aes.KEY_SIZE_256:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
+		sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
+		sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
+		sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
+		sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
+		sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
+		sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
+		sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
+		sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
+		sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
+		sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
+		sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
+		sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
+		sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
+		num_rounds = _aes.ROUNDS_256
+	case:
+		panic("crypto/aes: invalid AES key size")
+	}
+	for i in 0 ..= num_rounds {
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
+	}
+
+	// Compute the decryption keys.  GCM and CTR do not need this, however
+	// ECB, CBC, OCB3, etc do.
+	derive_dec_keys(ctx, &sks, num_rounds)
+
+	ctx._num_rounds = num_rounds
+
+	mem.zero_explicit(&sks, size_of(sks))
+}
@@ -0,0 +1,123 @@
+package _chacha20
+
+import "base:intrinsics"
+import "core:encoding/endian"
+import "core:math/bits"
+import "core:mem"
+
+// KEY_SIZE is the (X)ChaCha20 key size in bytes.
+KEY_SIZE :: 32
+// IV_SIZE is the ChaCha20 IV size in bytes.
+IV_SIZE :: 12
+// XIV_SIZE is the XChaCha20 IV size in bytes.
+XIV_SIZE :: 24
+
+// MAX_CTR_IETF is the maximum counter value for the IETF flavor ChaCha20.
+MAX_CTR_IETF :: 0xffffffff
+// BLOCK_SIZE is the (X)ChaCha20 block size in bytes.
+BLOCK_SIZE :: 64
+// STATE_SIZE_U32 is the (X)ChaCha20 state size in u32s.
+STATE_SIZE_U32 :: 16
+// ROUNDS is the (X)ChaCha20 round count.
+ROUNDS :: 20
+
+// SIGMA_0 is sigma[0:4].
+SIGMA_0: u32 : 0x61707865
+// SIGMA_1 is sigma[4:8].
+SIGMA_1: u32 : 0x3320646e
+// SIGMA_2 is sigma[8:12].
+SIGMA_2: u32 : 0x79622d32
+// SIGMA_3 is sigma[12:16].
+SIGMA_3: u32 : 0x6b206574
+
+// Context is a ChaCha20 or XChaCha20 instance.
+Context :: struct {
+	// Cipher state words.  init stores the sigma constants in [0:4], the
+	// key in [4:12], the block counter in [12], and the IV in [13:16].
+	// For the non-IETF flavor, seek also stores the upper 32 bits of the
+	// block counter in [13].
+	_s:              [STATE_SIZE_U32]u32,
+	// One block worth of bytes.  NOTE(review): presumably buffered
+	// keystream; its users are outside this file -- confirm.
+	_buffer:         [BLOCK_SIZE]byte,
+	// Set to BLOCK_SIZE by init and seek.  NOTE(review): presumably the
+	// read offset into _buffer, with BLOCK_SIZE meaning "empty" -- confirm.
+	_off:            int,
+	// Set to !is_xchacha by init.  When true, seek and
+	// check_counter_limit enforce the 32-bit MAX_CTR_IETF counter limit.
+	_is_ietf_flavor: bool,
+	// Set by init, cleared by reset, and asserted by seek.
+	_is_initialized: bool,
+}
+
+// init initializes a Context for ChaCha20 with the provided key and
+// iv, with the block counter starting at 0.
+//
+// The key must be KEY_SIZE bytes and the iv must be IV_SIZE bytes,
+// otherwise this traps.
+//
+// WARNING: This ONLY handles ChaCha20.  XChaCha20 sub-key and IV
+// derivation is expected to be handled by the caller, so that the
+// HChaCha call can be suitably accelerated.
+init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) {
+	if len(key) != KEY_SIZE || len(iv) != IV_SIZE {
+		intrinsics.trap()
+	}
+
+	// Words 0..3: the sigma constants.
+	ctx._s[0], ctx._s[1], ctx._s[2], ctx._s[3] = SIGMA_0, SIGMA_1, SIGMA_2, SIGMA_3
+
+	// Words 4..11: the key, read as little-endian u32s.
+	for i in 0 ..< KEY_SIZE / 4 {
+		ctx._s[4 + i] = endian.unchecked_get_u32le(key[4 * i:])
+	}
+
+	// Word 12: the block counter.
+	ctx._s[12] = 0
+
+	// Words 13..15: the IV, read as little-endian u32s.
+	for i in 0 ..< IV_SIZE / 4 {
+		ctx._s[13 + i] = endian.unchecked_get_u32le(iv[4 * i:])
+	}
+
+	ctx._is_initialized = true
+	ctx._is_ietf_flavor = !is_xchacha
+	ctx._off = BLOCK_SIZE
+}
+
+// seek sets the (X)ChaCha20 block counter to `block_nr`.
+//
+// The IETF flavor only has a 32-bit counter, so a `block_nr` above
+// MAX_CTR_IETF panics.  Otherwise the counter is 64 bits wide, with the
+// upper half stored in state word 13.  The Context must be initialized.
+seek :: proc(ctx: ^Context, block_nr: u64) {
+	assert(ctx._is_initialized)
+
+	if !ctx._is_ietf_flavor {
+		// 64-bit counter: the upper 32 bits live in word 13.
+		ctx._s[13] = u32(block_nr >> 32)
+	} else if block_nr > MAX_CTR_IETF {
+		panic("crypto/chacha20: attempted to seek past maximum counter")
+	}
+
+	// The lower 32 bits always live in word 12.
+	ctx._off = BLOCK_SIZE
+	ctx._s[12] = u32(block_nr)
+}
+
+// reset sanitizes the Context by explicitly zeroing the cipher state
+// and the block buffer.  The Context must be re-initialized to be used
+// again.
+reset :: proc(ctx: ^Context) {
+	ctx._is_initialized = false
+
+	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
+	mem.zero_explicit(&ctx._s, size_of(ctx._s))
+}
+
+// check_counter_limit panics if consuming `nr_blocks` more blocks of
+// keystream would exceed the maximum keystream allowed per IV.
+//
+// While all modern "standard" definitions of ChaCha20 use the IETF
+// 32-bit counter, most common XChaCha20 implementations allow for a
+// 64-bit counter.  Honestly, the answer here is "use a MRAE primitive",
+// but "common" practice is followed in the case of XChaCha20.
+check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) {
+	ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per IV reached"
+
+	wanted := u64(nr_blocks)
+	ctr_lo := u64(ctx._s[12])
+
+	if ctx._is_ietf_flavor {
+		// 32-bit counter held in word 12 only.
+		if ctr_lo + wanted > MAX_CTR_IETF {
+			panic(ERR_CTR_EXHAUSTED)
+		}
+		return
+	}
+
+	// 64-bit counter split across words 13 (high) and 12 (low); the
+	// limit is reached when the addition carries out of 64 bits.
+	ctr := (u64(ctx._s[13]) << 32) | ctr_lo
+	_, carry := bits.add_u64(ctr, wanted, 0)
+	if carry != 0 {
+		panic(ERR_CTR_EXHAUSTED)
+	}
+}
@@ -0,0 +1,360 @@
+package chacha20_ref
+
+import "core:crypto/_chacha20"
+import "core:encoding/endian"
+import "core:math/bits"
+
+// stream_blocks produces `nr_blocks` blocks of (X)ChaCha20 keystream
+// using the portable scalar implementation, and writes them to `dst`.
+// If `src` is non-nil, each output block is the XOR of the keystream
+// with the corresponding block of `src`; otherwise the raw keystream is
+// written.
+//
+// The block counter stored in `ctx` is incremented once per block.
+// This panics (via check_counter_limit) if the request would exceed the
+// maximum keystream per IV.  The buffer accesses are not bounds checked;
+// the caller must provide at least nr_blocks * BLOCK_SIZE bytes.
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst, src := dst, src
+	x := &ctx._s
+	for n := 0; n < nr_blocks; n = n + 1 {
+		// Load the working state.  Words 0..3 are always the sigma
+		// constants, so they are taken directly from the constants.
+		x0, x1, x2, x3 :=
+			_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
+			x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		// Each iteration is one double round: 4 column quarter rounds
+		// followed by 4 diagonal quarter rounds.
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			// Even when forcing inlining manually inlining all of
+			// these is decently faster.
+
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = bits.rotate_left32(x12, 16)
+			x8 += x12
+			x4 ~= x8
+			x4 = bits.rotate_left32(x4, 12)
+			x0 += x4
+			x12 ~= x0
+			x12 = bits.rotate_left32(x12, 8)
+			x8 += x12
+			x4 ~= x8
+			x4 = bits.rotate_left32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ~= x1
+			x13 = bits.rotate_left32(x13, 16)
+			x9 += x13
+			x5 ~= x9
+			x5 = bits.rotate_left32(x5, 12)
+			x1 += x5
+			x13 ~= x1
+			x13 = bits.rotate_left32(x13, 8)
+			x9 += x13
+			x5 ~= x9
+			x5 = bits.rotate_left32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ~= x2
+			x14 = bits.rotate_left32(x14, 16)
+			x10 += x14
+			x6 ~= x10
+			x6 = bits.rotate_left32(x6, 12)
+			x2 += x6
+			x14 ~= x2
+			x14 = bits.rotate_left32(x14, 8)
+			x10 += x14
+			x6 ~= x10
+			x6 = bits.rotate_left32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ~= x3
+			x15 = bits.rotate_left32(x15, 16)
+			x11 += x15
+			x7 ~= x11
+			x7 = bits.rotate_left32(x7, 12)
+			x3 += x7
+			x15 ~= x3
+			x15 = bits.rotate_left32(x15, 8)
+			x11 += x15
+			x7 ~= x11
+			x7 = bits.rotate_left32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ~= x0
+			x15 = bits.rotate_left32(x15, 16)
+			x10 += x15
+			x5 ~= x10
+			x5 = bits.rotate_left32(x5, 12)
+			x0 += x5
+			x15 ~= x0
+			x15 = bits.rotate_left32(x15, 8)
+			x10 += x15
+			x5 ~= x10
+			x5 = bits.rotate_left32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = bits.rotate_left32(x12, 16)
+			x11 += x12
+			x6 ~= x11
+			x6 = bits.rotate_left32(x6, 12)
+			x1 += x6
+			x12 ~= x1
+			x12 = bits.rotate_left32(x12, 8)
+			x11 += x12
+			x6 ~= x11
+			x6 = bits.rotate_left32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ~= x2
+			x13 = bits.rotate_left32(x13, 16)
+			x8 += x13
+			x7 ~= x8
+			x7 = bits.rotate_left32(x7, 12)
+			x2 += x7
+			x13 ~= x2
+			x13 = bits.rotate_left32(x13, 8)
+			x8 += x13
+			x7 ~= x8
+			x7 = bits.rotate_left32(x7, 7)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ~= x3
+			x14 = bits.rotate_left32(x14, 16)
+			x9 += x14
+			x4 ~= x9
+			x4 = bits.rotate_left32(x4, 12)
+			x3 += x4
+			x14 ~= x3
+			x14 = bits.rotate_left32(x14, 8)
+			x9 += x14
+			x4 ~= x9
+			x4 = bits.rotate_left32(x4, 7)
+		}
+
+		// Feed-forward: add the initial state to the permuted state.
+		x0 += _chacha20.SIGMA_0
+		x1 += _chacha20.SIGMA_1
+		x2 += _chacha20.SIGMA_2
+		x3 += _chacha20.SIGMA_3
+		x4 += x[4]
+		x5 += x[5]
+		x6 += x[6]
+		x7 += x[7]
+		x8 += x[8]
+		x9 += x[9]
+		x10 += x[10]
+		x11 += x[11]
+		x12 += x[12]
+		x13 += x[13]
+		x14 += x[14]
+		x15 += x[15]
+
+		// - The caller(s) ensure that src/dst are valid.
+		// - The compiler knows if the target is picky about alignment.
+
+		#no_bounds_check {
+			if src != nil {
+				// dst = src XOR keystream, one little-endian word at a time.
+				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
+				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
+				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
+				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
+				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
+				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
+				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
+				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
+				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
+				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
+				endian.unchecked_put_u32le(
+					dst[40:44],
+					endian.unchecked_get_u32le(src[40:44]) ~ x10,
+				)
+				endian.unchecked_put_u32le(
+					dst[44:48],
+					endian.unchecked_get_u32le(src[44:48]) ~ x11,
+				)
+				endian.unchecked_put_u32le(
+					dst[48:52],
+					endian.unchecked_get_u32le(src[48:52]) ~ x12,
+				)
+				endian.unchecked_put_u32le(
+					dst[52:56],
+					endian.unchecked_get_u32le(src[52:56]) ~ x13,
+				)
+				endian.unchecked_put_u32le(
+					dst[56:60],
+					endian.unchecked_get_u32le(src[56:60]) ~ x14,
+				)
+				endian.unchecked_put_u32le(
+					dst[60:64],
+					endian.unchecked_get_u32le(src[60:64]) ~ x15,
+				)
+				src = src[_chacha20.BLOCK_SIZE:]
+			} else {
+				// No input: write the raw keystream.
+				endian.unchecked_put_u32le(dst[0:4], x0)
+				endian.unchecked_put_u32le(dst[4:8], x1)
+				endian.unchecked_put_u32le(dst[8:12], x2)
+				endian.unchecked_put_u32le(dst[12:16], x3)
+				endian.unchecked_put_u32le(dst[16:20], x4)
+				endian.unchecked_put_u32le(dst[20:24], x5)
+				endian.unchecked_put_u32le(dst[24:28], x6)
+				endian.unchecked_put_u32le(dst[28:32], x7)
+				endian.unchecked_put_u32le(dst[32:36], x8)
+				endian.unchecked_put_u32le(dst[36:40], x9)
+				endian.unchecked_put_u32le(dst[40:44], x10)
+				endian.unchecked_put_u32le(dst[44:48], x11)
+				endian.unchecked_put_u32le(dst[48:52], x12)
+				endian.unchecked_put_u32le(dst[52:56], x13)
+				endian.unchecked_put_u32le(dst[56:60], x14)
+				endian.unchecked_put_u32le(dst[60:64], x15)
+			}
+			dst = dst[_chacha20.BLOCK_SIZE:]
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+	}
+}
+
+// hchacha20 computes HChaCha20 using the portable scalar implementation.
+//
+// The state is built from the sigma constants, 32 bytes of `key`, and
+// 16 bytes of `iv` (all read as little-endian u32s).  After the ChaCha20
+// rounds, words 0..3 and 12..15 are written to the first 32 bytes of
+// `dst` as little-endian u32s.  Unlike stream_blocks, the initial state
+// is not added back in.
+//
+// The caller is expected to provide a 32-byte `dst`, a 32-byte `key`,
+// and a 16-byte `iv`; the lengths are not validated here.
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
+	x4 := endian.unchecked_get_u32le(key[0:4])
+	x5 := endian.unchecked_get_u32le(key[4:8])
+	x6 := endian.unchecked_get_u32le(key[8:12])
+	x7 := endian.unchecked_get_u32le(key[12:16])
+	x8 := endian.unchecked_get_u32le(key[16:20])
+	x9 := endian.unchecked_get_u32le(key[20:24])
+	x10 := endian.unchecked_get_u32le(key[24:28])
+	x11 := endian.unchecked_get_u32le(key[28:32])
+	x12 := endian.unchecked_get_u32le(iv[0:4])
+	x13 := endian.unchecked_get_u32le(iv[4:8])
+	x14 := endian.unchecked_get_u32le(iv[8:12])
+	x15 := endian.unchecked_get_u32le(iv[12:16])
+
+	// Each iteration is one double round: 4 column quarter rounds
+	// followed by 4 diagonal quarter rounds.
+	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = bits.rotate_left32(x12, 16)
+		x8 += x12
+		x4 ~= x8
+		x4 = bits.rotate_left32(x4, 12)
+		x0 += x4
+		x12 ~= x0
+		x12 = bits.rotate_left32(x12, 8)
+		x8 += x12
+		x4 ~= x8
+		x4 = bits.rotate_left32(x4, 7)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ~= x1
+		x13 = bits.rotate_left32(x13, 16)
+		x9 += x13
+		x5 ~= x9
+		x5 = bits.rotate_left32(x5, 12)
+		x1 += x5
+		x13 ~= x1
+		x13 = bits.rotate_left32(x13, 8)
+		x9 += x13
+		x5 ~= x9
+		x5 = bits.rotate_left32(x5, 7)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ~= x2
+		x14 = bits.rotate_left32(x14, 16)
+		x10 += x14
+		x6 ~= x10
+		x6 = bits.rotate_left32(x6, 12)
+		x2 += x6
+		x14 ~= x2
+		x14 = bits.rotate_left32(x14, 8)
+		x10 += x14
+		x6 ~= x10
+		x6 = bits.rotate_left32(x6, 7)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ~= x3
+		x15 = bits.rotate_left32(x15, 16)
+		x11 += x15
+		x7 ~= x11
+		x7 = bits.rotate_left32(x7, 12)
+		x3 += x7
+		x15 ~= x3
+		x15 = bits.rotate_left32(x15, 8)
+		x11 += x15
+		x7 ~= x11
+		x7 = bits.rotate_left32(x7, 7)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ~= x0
+		x15 = bits.rotate_left32(x15, 16)
+		x10 += x15
+		x5 ~= x10
+		x5 = bits.rotate_left32(x5, 12)
+		x0 += x5
+		x15 ~= x0
+		x15 = bits.rotate_left32(x15, 8)
+		x10 += x15
+		x5 ~= x10
+		x5 = bits.rotate_left32(x5, 7)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = bits.rotate_left32(x12, 16)
+		x11 += x12
+		x6 ~= x11
+		x6 = bits.rotate_left32(x6, 12)
+		x1 += x6
+		x12 ~= x1
+		x12 = bits.rotate_left32(x12, 8)
+		x11 += x12
+		x6 ~= x11
+		x6 = bits.rotate_left32(x6, 7)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ~= x2
+		x13 = bits.rotate_left32(x13, 16)
+		x8 += x13
+		x7 ~= x8
+		x7 = bits.rotate_left32(x7, 12)
+		x2 += x7
+		x13 ~= x2
+		x13 = bits.rotate_left32(x13, 8)
+		x8 += x13
+		x7 ~= x8
+		x7 = bits.rotate_left32(x7, 7)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ~= x3
+		x14 = bits.rotate_left32(x14, 16)
+		x9 += x14
+		x4 ~= x9
+		x4 = bits.rotate_left32(x4, 12)
+		x3 += x4
+		x14 ~= x3
+		x14 = bits.rotate_left32(x14, 8)
+		x9 += x14
+		x4 ~= x9
+		x4 = bits.rotate_left32(x4, 7)
+	}
+
+	// Only the first and last rows of the state are output.
+	endian.unchecked_put_u32le(dst[0:4], x0)
+	endian.unchecked_put_u32le(dst[4:8], x1)
+	endian.unchecked_put_u32le(dst[8:12], x2)
+	endian.unchecked_put_u32le(dst[12:16], x3)
+	endian.unchecked_put_u32le(dst[16:20], x12)
+	endian.unchecked_put_u32le(dst[20:24], x13)
+	endian.unchecked_put_u32le(dst[24:28], x14)
+	endian.unchecked_put_u32le(dst[28:32], x15)
+}
@@ -0,0 +1,481 @@
+package chacha20_simd128
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import "core:simd"
+@(require) import "core:sys/info"
+
+// Portable 128-bit `core:simd` implementation.
+//
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementation.
+//
+// This is written to perform adequately on any target that has "enough"
+// 128-bit vector registers, the current thought is that 4 blocks at a
+// time is reasonable for amd64, though Ted's code is more conservative.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+
+// Ensure the compiler emits SIMD instructions.  This is a minimum, and
+// setting the microarchitecture at compile time will allow for better
+// code gen when applicable (eg: AVX).  This is somewhat redundant with
+// the default microarchitecture configurations.
+//
+// TARGET_SIMD_FEATURES is passed to `enable_target_feature` on the
+// public procedures in this file.
+when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "neon"
+} else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+	// Note: LLVM appears to be smart enough to use PSHUFB despite not
+	// explicitly using simd.u8x16 shuffles.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: "sse2,ssse3"
+} else {
+	// Any other architecture: no target feature is requested.
+	@(private = "file")
+	TARGET_SIMD_FEATURES :: ""
+}
+
+// Per-lane shift amounts used by _dq_round_simd128 to build 32-bit
+// left rotations out of a shift pair: the `L` constant is the left shift
+// and the matching `R` constant (32 - L) is the right shift.  A rotation
+// by 16 uses the same amount in both directions.
+@(private = "file")
+_ROT_7L: simd.u32x4 : {7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x4 : {25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x4 : {12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x4 : {20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x4 : {8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x4 : {24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x4 : {16, 16, 16, 16}
+
+when ODIN_ENDIAN == .Big {
+	// _increment_counter adds 1 to the 64-bit block counter held in
+	// state words 12 (low) and 13 (high) of `ctx`, writes it back, and
+	// returns the updated last row of the state (words 12..15).
+	//
+	// Note: The Context type lives in the `_chacha20` package, so it
+	// must be referred to by its qualified name here.
+	@(private = "file")
+	_increment_counter :: #force_inline proc "contextless" (
+		ctx: ^_chacha20.Context,
+	) -> simd.u32x4 {
+		// In the Big Endian case, the low and high portions in the vector
+		// are flipped, so the 64-bit addition can't be done with a simple
+		// vector add.
+		x := &ctx._s
+
+		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
+		x[12] = u32(new_ctr)
+		x[13] = u32(new_ctr >> 32)
+
+		return intrinsics.unaligned_load(transmute(^simd.u32x4)&x[12])
+	}
+
+	// Convert the endian-ness of the components of a u32x4 vector, for
+	// the purposes of output.
+	@(private = "file")
+	_byteswap_u32x4 :: #force_inline proc "contextless" (v: simd.u32x4) -> simd.u32x4 {
+		// Reverse the 4 bytes within each of the 4 lanes.
+		return(
+			transmute(simd.u32x4)simd.shuffle(
+				transmute(simd.u8x16)v,
+				transmute(simd.u8x16)v,
+				3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
+			)
+		)
+	}
+} else {
+	// On Little Endian targets the counter (words 12/13) forms the low
+	// u64 lane of the last state row, so it can be incremented by adding
+	// this vector to the row viewed as a u64x2.
+	@(private = "file")
+	_VEC_ONE: simd.u64x2 : {1, 0}
+}
+
+// _dq_round_simd128 applies one ChaCha double round to a state held as
+// four row vectors (v0..v3), and returns the updated rows.
+//
+// The first half operates on the rows as-is.  The rows are then rotated
+// lane-wise by 1, 2 and 3 positions so that the second half operates on
+// the diagonals, and rotated back before returning.  The 32-bit
+// rotations are built from a left shift and a right shift that are
+// combined with XOR.
+@(private = "file")
+_dq_round_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+	v1 = simd.shuffle(v1, v1, 1, 2, 3, 0)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 3, 0, 1, 2)
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+	v1 = simd.shuffle(v1, v1, 3, 0, 1, 2)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1)
+	v3 = simd.shuffle(v3, v3, 1, 2, 3, 0)
+
+	return v0, v1, v2, v3
+}
+
+// _add_state_simd128 adds the initial state rows (s0..s3) to the
+// permuted rows (v0..v3) lane-wise, and returns the sums in the byte
+// order used for output (byte-swapped per lane on Big Endian targets).
+@(private = "file")
+_add_state_simd128 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	r0 := simd.add(v0, s0)
+	r1 := simd.add(v1, s1)
+	r2 := simd.add(v2, s2)
+	r3 := simd.add(v3, s3)
+
+	when ODIN_ENDIAN == .Big {
+		// The output is little-endian, so swap the bytes of each lane.
+		r0, r1, r2, r3 =
+			_byteswap_u32x4(r0), _byteswap_u32x4(r1), _byteswap_u32x4(r2), _byteswap_u32x4(r3)
+	}
+
+	return r0, r1, r2, r3
+}
+
+// _xor_simd128 XORs the four vectors v0..v3 with the four vectors
+// (64 bytes) starting at `src`, and returns the results.  `src` is read
+// with unaligned loads.
+@(private = "file")
+_xor_simd128 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) -> (
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+	simd.u32x4,
+) {
+	in0 := intrinsics.unaligned_load(&src[0])
+	in1 := intrinsics.unaligned_load(&src[1])
+	in2 := intrinsics.unaligned_load(&src[2])
+	in3 := intrinsics.unaligned_load(&src[3])
+
+	return simd.bit_xor(v0, in0),
+		simd.bit_xor(v1, in1),
+		simd.bit_xor(v2, in2),
+		simd.bit_xor(v3, in3)
+}
+
+// _store_simd128 writes the four vectors v0..v3 to the 64 bytes
+// starting at `dst`, using unaligned stores.
+@(private = "file")
+_store_simd128 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x4,
+	v0, v1, v2, v3: simd.u32x4,
+) {
+	intrinsics.unaligned_store(&dst[0], v0)
+	intrinsics.unaligned_store(&dst[1], v1)
+	intrinsics.unaligned_store(&dst[2], v2)
+	intrinsics.unaligned_store(&dst[3], v3)
+}
+
+// is_performant returns true iff the target and current host both support
+// "enough" 128-bit SIMD to make this implementation performant.
+//
+// On ARM and x86 this is decided at runtime from the detected CPU
+// features, on WebAssembly from the compile-time target features, and
+// on every other architecture the answer is always false.
+is_performant :: proc "contextless" () -> bool {
+	when ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32 {
+		return intrinsics.has_target_feature("simd128")
+	} else when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+		when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 {
+			req_features :: info.CPU_Features{.sse2, .ssse3}
+		} else {
+			req_features :: info.CPU_Features{.asimd}
+		}
+
+		// No feature information means the answer is unknown, so be
+		// conservative.
+		if features, ok := info.cpu_features.?; ok {
+			return features >= req_features
+		}
+		return false
+	} else {
+		return false
+	}
+}
+
+// stream_blocks produces `nr_blocks` blocks of (X)ChaCha20 keystream
+// using 128-bit SIMD, and writes them to `dst`.  If `src` is non-nil,
+// each output block is the XOR of the keystream with the corresponding
+// block of `src`; otherwise the raw keystream is written.
+//
+// Blocks are processed 8 at a time (arm64 only), then 4 at a time
+// (everywhere but i386), then 1 at a time.  The block counter stored in
+// `ctx` is advanced by `nr_blocks`.  This panics (via
+// check_counter_limit) if the request would exceed the maximum keystream
+// per IV.  The buffers are accessed through raw pointers without bounds
+// checks; the caller must provide at least nr_blocks * BLOCK_SIZE bytes.
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst_v := ([^]simd.u32x4)(raw_data(dst))
+	src_v := ([^]simd.u32x4)(raw_data(src))
+
+	x := &ctx._s
+	n := nr_blocks
+
+	// The state vector is an array of uint32s in native byte-order.
+	// s0..s2 stay constant; s3 holds the counter and is updated as
+	// blocks are produced.
+	x_v := ([^]simd.u32x4)(raw_data(x))
+	s0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+	s1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+	s2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+	s3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+
+	// 8 blocks at a time.
+	//
+	// Note: This is only worth it on Aarch64.
+	when ODIN_ARCH == .arm64 {
+		for ; n >= 8; n = n - 8 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			// Each subsequent block uses the previous block's last row
+			// with the counter incremented by one.
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			when ODIN_ENDIAN == .Little {
+				s19 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s19 := _increment_counter(ctx)
+			}
+
+			v16, v17, v18, v19 := s0, s1, s2, s19
+			when ODIN_ENDIAN == .Little {
+				s23 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s19, _VEC_ONE)
+			} else {
+				s23 := _increment_counter(ctx)
+			}
+
+			v20, v21, v22, v23 := s0, s1, s2, s23
+			when ODIN_ENDIAN == .Little {
+				s27 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s23, _VEC_ONE)
+			} else {
+				s27 := _increment_counter(ctx)
+			}
+
+			v24, v25, v26, v27 := s0, s1, s2, s27
+			when ODIN_ENDIAN == .Little {
+				s31 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s27, _VEC_ONE)
+			} else {
+				s31 := _increment_counter(ctx)
+			}
+			v28, v29, v30, v31 := s0, s1, s2, s31
+
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+				v16, v17, v18, v19 = _dq_round_simd128(v16, v17, v18, v19)
+				v20, v21, v22, v23 = _dq_round_simd128(v20, v21, v22, v23)
+				v24, v25, v26, v27 = _dq_round_simd128(v24, v25, v26, v27)
+				v28, v29, v30, v31 = _dq_round_simd128(v28, v29, v30, v31)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+			v16, v17, v18, v19 = _add_state_simd128(v16, v17, v18, v19, s0, s1, s2, s19)
+			v20, v21, v22, v23 = _add_state_simd128(v20, v21, v22, v23, s0, s1, s2, s23)
+			v24, v25, v26, v27 = _add_state_simd128(v24, v25, v26, v27, s0, s1, s2, s27)
+			v28, v29, v30, v31 = _add_state_simd128(v28, v29, v30, v31, s0, s1, s2, s31)
+
+			#no_bounds_check {
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					v16, v17, v18, v19 = _xor_simd128(src_v[16:], v16, v17, v18, v19)
+					v20, v21, v22, v23 = _xor_simd128(src_v[20:], v20, v21, v22, v23)
+					v24, v25, v26, v27 = _xor_simd128(src_v[24:], v24, v25, v26, v27)
+					v28, v29, v30, v31 = _xor_simd128(src_v[28:], v28, v29, v30, v31)
+					src_v = src_v[32:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				_store_simd128(dst_v[16:], v16, v17, v18, v19)
+				_store_simd128(dst_v[20:], v20, v21, v22, v23)
+				_store_simd128(dst_v[24:], v24, v25, v26, v27)
+				_store_simd128(dst_v[28:], v28, v29, v30, v31)
+				dst_v = dst_v[32:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s31 holds the most current counter, so `s3 = s31 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s31, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 4 blocks at a time.
+	//
+	// Note: The i386 target lacks the required number of registers
+	// for this to be performant, so it is skipped.
+	when ODIN_ARCH != .i386 {
+		for ; n >= 4; n = n - 4 {
+			v0, v1, v2, v3 := s0, s1, s2, s3
+
+			when ODIN_ENDIAN == .Little {
+				s7 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+			} else {
+				s7 := _increment_counter(ctx)
+			}
+			v4, v5, v6, v7 := s0, s1, s2, s7
+
+			when ODIN_ENDIAN == .Little {
+				s11 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s7, _VEC_ONE)
+			} else {
+				s11 := _increment_counter(ctx)
+			}
+			v8, v9, v10, v11 := s0, s1, s2, s11
+
+			when ODIN_ENDIAN == .Little {
+				s15 := transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s11, _VEC_ONE)
+			} else {
+				s15 := _increment_counter(ctx)
+			}
+			v12, v13, v14, v15 := s0, s1, s2, s15
+
+			for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+				v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+				v4, v5, v6, v7 = _dq_round_simd128(v4, v5, v6, v7)
+				v8, v9, v10, v11 = _dq_round_simd128(v8, v9, v10, v11)
+				v12, v13, v14, v15 = _dq_round_simd128(v12, v13, v14, v15)
+			}
+
+			v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+			v4, v5, v6, v7 = _add_state_simd128(v4, v5, v6, v7, s0, s1, s2, s7)
+			v8, v9, v10, v11 = _add_state_simd128(v8, v9, v10, v11, s0, s1, s2, s11)
+			v12, v13, v14, v15 = _add_state_simd128(v12, v13, v14, v15, s0, s1, s2, s15)
+
+			#no_bounds_check {
+				if src != nil {
+					v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+					v4, v5, v6, v7 = _xor_simd128(src_v[4:], v4, v5, v6, v7)
+					v8, v9, v10, v11 = _xor_simd128(src_v[8:], v8, v9, v10, v11)
+					v12, v13, v14, v15 = _xor_simd128(src_v[12:], v12, v13, v14, v15)
+					src_v = src_v[16:]
+				}
+
+				_store_simd128(dst_v, v0, v1, v2, v3)
+				_store_simd128(dst_v[4:], v4, v5, v6, v7)
+				_store_simd128(dst_v[8:], v8, v9, v10, v11)
+				_store_simd128(dst_v[12:], v12, v13, v14, v15)
+				dst_v = dst_v[16:]
+			}
+
+			when ODIN_ENDIAN == .Little {
+				// s15 holds the most current counter, so `s3 = s15 + 1`.
+				s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s15, _VEC_ONE)
+			} else {
+				s3 = _increment_counter(ctx)
+			}
+		}
+	}
+
+	// 1 block at a time.
+	for ; n > 0; n = n - 1 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd128(v0, v1, v2, v3)
+		}
+		v0, v1, v2, v3 = _add_state_simd128(v0, v1, v2, v3, s0, s1, s2, s3)
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd128(src_v, v0, v1, v2, v3)
+				src_v = src_v[4:]
+			}
+
+			_store_simd128(dst_v, v0, v1, v2, v3)
+			dst_v = dst_v[4:]
+		}
+
+		// Increment the counter.  Overflow checking is done upon
+		// entry into the routine, so a 64-bit increment safely
+		// covers both cases.
+		when ODIN_ENDIAN == .Little {
+			s3 = transmute(simd.u32x4)simd.add(transmute(simd.u64x2)s3, _VEC_ONE)
+		} else {
+			s3 = _increment_counter(ctx)
+		}
+	}
+
+	// On Big Endian targets _increment_counter has already updated the
+	// state in `ctx` directly, so no write-back is required there.
+	when ODIN_ENDIAN == .Little {
+		// Write back the counter to the state.
+		intrinsics.unaligned_store((^simd.u32x4)(x_v[3:]), s3)
+	}
+}
+
+// hchacha20 computes HChaCha20 using 128-bit SIMD.
+//
+// The state rows are the sigma constants, the two 16-byte halves of
+// `key`, and the first 16 bytes of `iv`.  After the ChaCha20 rounds, the
+// first and last rows are written to the first 32 bytes of `dst`.  The
+// initial state is not added back in.
+@(enable_target_feature = TARGET_SIMD_FEATURES)
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	row_a := simd.u32x4{_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3}
+	row_b := intrinsics.unaligned_load((^simd.u32x4)(&key[0]))
+	row_c := intrinsics.unaligned_load((^simd.u32x4)(&key[16]))
+	row_d := intrinsics.unaligned_load((^simd.u32x4)(&iv[0]))
+
+	when ODIN_ENDIAN == .Big {
+		// The inputs are little-endian words; convert to native order.
+		row_b = _byteswap_u32x4(row_b)
+		row_c = _byteswap_u32x4(row_c)
+		row_d = _byteswap_u32x4(row_d)
+	}
+
+	// Each call is a double round, so ROUNDS / 2 iterations.
+	for _ in 0 ..< _chacha20.ROUNDS / 2 {
+		row_a, row_b, row_c, row_d = _dq_round_simd128(row_a, row_b, row_c, row_d)
+	}
+
+	when ODIN_ENDIAN == .Big {
+		// Convert the output rows back to little-endian words.
+		row_a = _byteswap_u32x4(row_a)
+		row_d = _byteswap_u32x4(row_d)
+	}
+
+	out := ([^]simd.u32x4)(raw_data(dst))
+	intrinsics.unaligned_store(&out[0], row_a)
+	intrinsics.unaligned_store(&out[1], row_d)
+}
@@ -0,0 +1,319 @@
+//+build amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+import chacha_simd128 "core:crypto/_chacha20/simd128"
+import "core:simd"
+import "core:sys/info"
+
+// This is loosely based on Ted Krovetz's public domain C intrinsic
+// implementations.  While written using `core:simd`, this is currently
+// amd64 specific because we do not have a way to detect ARM SVE.
+//
+// See:
+// supercop-20230530/crypto_stream/chacha20/krovetz/vec128
+// supercop-20230530/crypto_stream/chacha20/krovetz/avx2
+
+#assert(ODIN_ENDIAN == .Little)
+
+@(private = "file")
+_ROT_7L: simd.u32x8 : {7, 7, 7, 7, 7, 7, 7, 7}
+@(private = "file")
+_ROT_7R: simd.u32x8 : {25, 25, 25, 25, 25, 25, 25, 25}
+@(private = "file")
+_ROT_12L: simd.u32x8 : {12, 12, 12, 12, 12, 12, 12, 12}
+@(private = "file")
+_ROT_12R: simd.u32x8 : {20, 20, 20, 20, 20, 20, 20, 20}
+@(private = "file")
+_ROT_8L: simd.u32x8 : {8, 8, 8, 8, 8, 8, 8, 8}
+@(private = "file")
+_ROT_8R: simd.u32x8 : {24, 24, 24, 24, 24, 24, 24, 24}
+@(private = "file")
+_ROT_16: simd.u32x8 : {16, 16, 16, 16, 16, 16, 16, 16}
+@(private = "file")
+_VEC_ZERO_ONE: simd.u64x4 : {0, 0, 1, 0}
+@(private = "file")
+_VEC_TWO: simd.u64x4 : {2, 0, 2, 0}
+
+// is_performant reports whether the CPU features detected at runtime
+// include everything this implementation needs (AVX and AVX2).  It
+// returns false when the feature set is unavailable.
+is_performant :: proc "contextless" () -> bool {
+	req_features :: info.CPU_Features{.avx, .avx2}
+
+	if features, ok := info.cpu_features.?; ok {
+		// Superset test: every required feature must be present.
+		return features >= req_features
+	}
+
+	return false
+}
+
+// _dq_round_simd256 applies one double-round to the state rows v0 .. v3
+// and returns the updated rows.
+//
+// The quarter-round arithmetic (add, xor, rotate by 16/12/8/7) is applied
+// to all lanes at once, the rows are rotated lane-wise, the arithmetic is
+// applied again, and the rows are rotated back.  The lane rotations never
+// cross the boundary between lanes 0-3 and lanes 4-7, so each half of a
+// vector is processed as an independent state.
+//
+// Rotations are built from a left shift combined with the matching right
+// shift, using the _ROT_* constants.
+@(private = "file")
+_dq_round_simd256 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	v0, v1, v2, v3 := v0, v1, v2, v3
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+	v1 = simd.shuffle(v1, v1, 1, 2, 3, 0, 5, 6, 7, 4)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+	v3 = simd.shuffle(v3, v3, 3, 0, 1, 2, 7, 4, 5, 6)
+
+	// a += b; d ^= a; d = ROTW16(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_16), simd.shr(v3, _ROT_16))
+
+	// c += d; b ^= c; b = ROTW12(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_12L), simd.shr(v1, _ROT_12R))
+
+	// a += b; d ^= a; d = ROTW8(d);
+	v0 = simd.add(v0, v1)
+	v3 = simd.bit_xor(v3, v0)
+	v3 = simd.bit_xor(simd.shl(v3, _ROT_8L), simd.shr(v3, _ROT_8R))
+
+	// c += d; b ^= c; b = ROTW7(b);
+	v2 = simd.add(v2, v3)
+	v1 = simd.bit_xor(v1, v2)
+	v1 = simd.bit_xor(simd.shl(v1, _ROT_7L), simd.shr(v1, _ROT_7R))
+
+	// b = ROTV3(b); c = ROTV2(c); d = ROTV1(d);
+	v1 = simd.shuffle(v1, v1, 3, 0, 1, 2, 7, 4, 5, 6)
+	v2 = simd.shuffle(v2, v2, 2, 3, 0, 1, 6, 7, 4, 5)
+	v3 = simd.shuffle(v3, v3, 1, 2, 3, 0, 5, 6, 7, 4)
+
+	return v0, v1, v2, v3
+}
+
+// _add_and_permute_state_simd256 adds the input state rows s0 .. s3 to the
+// working rows v0 .. v3, then regroups the sums so that each pair of
+// returned vectors holds one complete block.
+//
+// The results are returned in the order: block 0 (first half), block 0
+// (second half), block 1 (first half), block 1 (second half).
+@(private = "file")
+_add_and_permute_state_simd256 :: #force_inline proc "contextless" (
+	v0, v1, v2, v3, s0, s1, s2, s3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	sum0 := simd.add(v0, s0)
+	sum1 := simd.add(v1, s1)
+	sum2 := simd.add(v2, s2)
+	sum3 := simd.add(v3, s3)
+
+	// Big Endian would byteswap here.
+
+	// Lanes 0-3 of every sum belong to block 0, and lanes 4-7 belong to
+	// block 1.  Gather the matching halves of two rows into one vector.
+	blk0_lo := simd.shuffle(sum0, sum1, 0, 1, 2, 3, 8, 9, 10, 11)
+	blk0_hi := simd.shuffle(sum2, sum3, 0, 1, 2, 3, 8, 9, 10, 11)
+	blk1_lo := simd.shuffle(sum0, sum1, 4, 5, 6, 7, 12, 13, 14, 15)
+	blk1_hi := simd.shuffle(sum2, sum3, 4, 5, 6, 7, 12, 13, 14, 15)
+
+	return blk0_lo, blk0_hi, blk1_lo, blk1_hi
+}
+
+// _xor_simd256 XORs v0 .. v3 with four consecutive 256-bit vectors read
+// (unaligned) from src, and returns the results.
+@(private = "file")
+_xor_simd256 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x8,
+	v0, v1, v2, v3: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+	simd.u32x8,
+) {
+	in0 := intrinsics.unaligned_load(&src[0])
+	in1 := intrinsics.unaligned_load(&src[1])
+	in2 := intrinsics.unaligned_load(&src[2])
+	in3 := intrinsics.unaligned_load(&src[3])
+
+	return simd.bit_xor(v0, in0), simd.bit_xor(v1, in1), simd.bit_xor(v2, in2), simd.bit_xor(v3, in3)
+}
+
+// _xor_simd256_x1 XORs v0 and v1 with two consecutive 256-bit vectors
+// read (unaligned) from src, and returns the results.
+@(private = "file")
+_xor_simd256_x1 :: #force_inline proc "contextless" (
+	src: [^]simd.u32x8,
+	v0, v1: simd.u32x8,
+) -> (
+	simd.u32x8,
+	simd.u32x8,
+) {
+	in0 := intrinsics.unaligned_load(&src[0])
+	in1 := intrinsics.unaligned_load(&src[1])
+
+	return simd.bit_xor(v0, in0), simd.bit_xor(v1, in1)
+}
+
+// _store_simd256 writes v0 .. v3 to four consecutive 256-bit slots
+// starting at dst, using unaligned stores.
+@(private = "file")
+_store_simd256 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x8,
+	v0, v1, v2, v3: simd.u32x8,
+) {
+	intrinsics.unaligned_store(&dst[0], v0)
+	intrinsics.unaligned_store(&dst[1], v1)
+	intrinsics.unaligned_store(&dst[2], v2)
+	intrinsics.unaligned_store(&dst[3], v3)
+}
+
+// _store_simd256_x1 writes v0 and v1 to two consecutive 256-bit slots
+// starting at dst, using unaligned stores.
+@(private = "file")
+_store_simd256_x1 :: #force_inline proc "contextless" (
+	dst: [^]simd.u32x8,
+	v0, v1: simd.u32x8,
+) {
+	intrinsics.unaligned_store(&dst[0], v0)
+	intrinsics.unaligned_store(&dst[1], v1)
+}
+
+// stream_blocks generates nr_blocks 64-byte blocks of keystream from the
+// state in ctx and writes them to dst.  If src is non-nil, the keystream
+// is XORed with src before being stored.  The 64-bit block counter held
+// in ctx._s[12] (low word) and ctx._s[13] (high word) is advanced by
+// nr_blocks on return.
+//
+// dst (and src, if provided) must cover at least nr_blocks * 64 bytes;
+// they are accessed through raw pointers without bounds checks.
+@(enable_target_feature = "sse2,ssse3,avx,avx2")
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	// Enforce the maximum consumed keystream per IV.
+	_chacha20.check_counter_limit(ctx, nr_blocks)
+
+	dst_v := ([^]simd.u32x8)(raw_data(dst))
+	src_v := ([^]simd.u32x8)(raw_data(src))
+
+	x := &ctx._s
+	n := nr_blocks
+
+	// The state vector is an array of uint32s in native byte-order.
+	// Setup s0 .. s3 such that each register stores 2 copies of the
+	// state.
+	x_v := ([^]simd.u32x4)(raw_data(x))
+	t0 := intrinsics.unaligned_load((^simd.u32x4)(x_v[0:]))
+	t1 := intrinsics.unaligned_load((^simd.u32x4)(x_v[1:]))
+	t2 := intrinsics.unaligned_load((^simd.u32x4)(x_v[2:]))
+	t3 := intrinsics.unaligned_load((^simd.u32x4)(x_v[3:]))
+	s0 := simd.swizzle(t0, 0, 1, 2, 3, 0, 1, 2, 3)
+	s1 := simd.swizzle(t1, 0, 1, 2, 3, 0, 1, 2, 3)
+	s2 := simd.swizzle(t2, 0, 1, 2, 3, 0, 1, 2, 3)
+	s3 := simd.swizzle(t3, 0, 1, 2, 3, 0, 1, 2, 3)
+
+	// Advance the counter in the 2nd copy of the state by one.
+	s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_ZERO_ONE)
+
+	// 8 blocks at a time.
+	for ; n >= 8; n = n - 8 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		// s3 covers counters (c, c + 1).  Each further row 3 is the
+		// previous one with both of its counters advanced by 2.
+		s7 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+		v4, v5, v6, v7 := s0, s1, s2, s7
+
+		s11 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s7, _VEC_TWO)
+		v8, v9, v10, v11 := s0, s1, s2, s11
+
+		s15 := transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s11, _VEC_TWO)
+		v12, v13, v14, v15 := s0, s1, s2, s15
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+			v4, v5, v6, v7 = _dq_round_simd256(v4, v5, v6, v7)
+			v8, v9, v10, v11 = _dq_round_simd256(v8, v9, v10, v11)
+			v12, v13, v14, v15 = _dq_round_simd256(v12, v13, v14, v15)
+		}
+
+		v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+		v4, v5, v6, v7 = _add_and_permute_state_simd256(v4, v5, v6, v7, s0, s1, s2, s7)
+		v8, v9, v10, v11 = _add_and_permute_state_simd256(v8, v9, v10, v11, s0, s1, s2, s11)
+		v12, v13, v14, v15 = _add_and_permute_state_simd256(v12, v13, v14, v15, s0, s1, s2, s15)
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+				v4, v5, v6, v7 = _xor_simd256(src_v[4:], v4, v5, v6, v7)
+				v8, v9, v10, v11 = _xor_simd256(src_v[8:], v8, v9, v10, v11)
+				v12, v13, v14, v15 = _xor_simd256(src_v[12:], v12, v13, v14, v15)
+				src_v = src_v[16:]
+			}
+
+			_store_simd256(dst_v, v0, v1, v2, v3)
+			_store_simd256(dst_v[4:], v4, v5, v6, v7)
+			_store_simd256(dst_v[8:], v8, v9, v10, v11)
+			_store_simd256(dst_v[12:], v12, v13, v14, v15)
+			dst_v = dst_v[16:]
+		}
+
+		// Carry the counters forward for the next iteration (or for the
+		// remainder loop below).
+		s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s15, _VEC_TWO)
+	}
+
+
+	// 2 (or 1) block at a time.
+	for ; n > 0; n = n - 2 {
+		v0, v1, v2, v3 := s0, s1, s2, s3
+
+		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
+			v0, v1, v2, v3 = _dq_round_simd256(v0, v1, v2, v3)
+		}
+		v0, v1, v2, v3 = _add_and_permute_state_simd256(v0, v1, v2, v3, s0, s1, s2, s3)
+
+		if n == 1 {
+			// Note: No need to advance src_v, dst_v, or increment the counter
+			// since this is guaranteed to be the final block.
+			//
+			// Only (v0, v1), the first of the two computed blocks, is used.
+			#no_bounds_check {
+				if src != nil {
+					v0, v1 = _xor_simd256_x1(src_v, v0, v1)
+				}
+
+				_store_simd256_x1(dst_v, v0, v1)
+			}
+			break
+		}
+
+		#no_bounds_check {
+			if src != nil {
+				v0, v1, v2, v3 = _xor_simd256(src_v, v0, v1, v2, v3)
+				src_v = src_v[4:]
+			}
+
+			_store_simd256(dst_v, v0, v1, v2, v3)
+			dst_v = dst_v[4:]
+		}
+
+		s3 = transmute(simd.u32x8)simd.add(transmute(simd.u64x4)s3, _VEC_TWO)
+	}
+
+	// Write back the counter.  Doing it this way saves having to
+	// pull out the correct counter value from s3.
+	new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + u64(nr_blocks)
+	ctx._s[12] = u32(new_ctr)
+	ctx._s[13] = u32(new_ctr >> 32)
+}
+
+// hchacha20 forwards to the simd128 implementation, which is inlined
+// here and compiled with AVX enabled.
+@(enable_target_feature = "sse2,ssse3,avx")
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	// We can just enable AVX and call the simd128 code, as going
+	// wider has 0 performance benefit, but VEX encoded instructions
+	// are nice.
+	#force_inline chacha_simd128.hchacha20(dst, key, iv)
+}
@@ -0,0 +1,17 @@
+//+build !amd64
+package chacha20_simd256
+
+import "base:intrinsics"
+import "core:crypto/_chacha20"
+
+// is_performant always returns false in this file, which is built for
+// every target other than amd64.
+is_performant :: proc "contextless" () -> bool {
+	return false
+}
+
+// stream_blocks is a stub for targets other than amd64; it always
+// panics.  Callers are expected to check is_performant first.
+stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
+	panic("crypto/chacha20: simd256 implementation unsupported")
+}
+
+// hchacha20 is a stub for targets other than amd64; it always traps.
+// NOTE(review): presumably a trap is used instead of panic because this
+// proc is "contextless" -- confirm.
+hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
+	intrinsics.trap()
+}
@@ -0,0 +1,36 @@
+package aead
+
+// seal_oneshot encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided algorithm, key, and iv, stores the output in dst and tag.
+//
+// A temporary Context is initialized for the call and reset before
+// returning.  If impl is nil, init selects the algorithm's default
+// implementation.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, impl: Implementation = nil) {
+	ctx: Context
+	init(&ctx, algo, key, impl)
+	defer reset(&ctx)
+	seal_ctx(&ctx, dst, tag, iv, aad, plaintext)
+}
+
+// open_oneshot authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided algorithm, key, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// A temporary Context is initialized for the call and reset before
+// returning.  If impl is nil, init selects the algorithm's default
+// implementation.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open_oneshot :: proc(algo: Algorithm, dst, key, iv, aad, ciphertext, tag: []byte, impl: Implementation = nil) -> bool {
+	ctx: Context
+	init(&ctx, algo, key, impl)
+	defer reset(&ctx)
+	return open_ctx(&ctx, dst, iv, aad, ciphertext, tag)
+}
+
+// seal is a procedure group covering both the Context based (seal_ctx)
+// and one-shot (seal_oneshot) sealing procedures.
+seal :: proc {
+	seal_ctx,
+	seal_oneshot,
+}
+
+// open is a procedure group covering both the Context based (open_ctx)
+// and one-shot (open_oneshot) opening procedures.
+open :: proc {
+	open_ctx,
+	open_oneshot,
+}
@@ -0,0 +1,58 @@
+/*
+package aead provides a generic interface to the supported Authenticated
+Encryption with Associated Data algorithms.
+
+Both a one-shot and context based interface are provided, with similar
+usage.  If multiple messages are to be sealed/opened via the same key,
+the context based interface may be more efficient, depending on the
+algorithm.
+
+WARNING: Reusing the same key + iv to seal (encrypt) multiple messages
+results in catastrophic loss of security for most algorithms.
+
+```odin
+package aead_example
+
+import "core:bytes"
+import "core:crypto"
+import "core:crypto/aead"
+
+main :: proc() {
+	algo := aead.Algorithm.XCHACHA20POLY1305
+
+	// The example's associated data and plaintext.
+	aad_str := "Get your ass in gear boys."
+	pt_str := "They're immanentizing the Eschaton."
+
+	aad := transmute([]byte)aad_str
+	plaintext := transmute([]byte)pt_str
+	pt_len := len(plaintext)
+
+	// Generate a random key for the purposes of illustration.
+	key := make([]byte, aead.KEY_SIZES[algo])
+	defer delete(key)
+	crypto.rand_bytes(key)
+
+	// `ciphertext || tag`, is a common way data is transmitted, so
+	// demonstrate that.
+	buf := make([]byte, pt_len + aead.TAG_SIZES[algo])
+	defer delete(buf)
+	ciphertext, tag := buf[:pt_len], buf[pt_len:]
+
+	// Seal the AAD + Plaintext.
+	iv := make([]byte, aead.IV_SIZES[algo])
+	defer delete(iv)
+	crypto.rand_bytes(iv) // Random IVs are safe with XChaCha20-Poly1305.
+	aead.seal(algo, ciphertext, tag, key, iv, aad, plaintext)
+
+	// Open the AAD + Ciphertext.
+	opened_pt := buf[:pt_len]
+	if ok := aead.open(algo, opened_pt, key, iv, aad, ciphertext, tag); !ok {
+		panic("aead example: failed to open")
+	}
+
+	assert(bytes.equal(opened_pt, plaintext))
+}
+```
+*/
+package aead
@@ -0,0 +1,187 @@
+package aead
+
+import "core:crypto/aes"
+import "core:crypto/chacha20"
+import "core:crypto/chacha20poly1305"
+import "core:reflect"
+
+// Implementation is an AEAD implementation.  Most callers will not need
+// to use this as the package will automatically select the most performant
+// implementation available.
+Implementation :: union {
+	aes.Implementation,
+	chacha20.Implementation,
+}
+
+// MAX_TAG_SIZE is the maximum size tag that can be returned by any of the
+// Algorithms supported via this package.
+MAX_TAG_SIZE :: 16
+
+// Algorithm is the algorithm identifier associated with a given Context.
+Algorithm :: enum {
+	Invalid,
+	AES_GCM_128,
+	AES_GCM_192,
+	AES_GCM_256,
+	CHACHA20POLY1305,
+	XCHACHA20POLY1305,
+}
+
+// ALGORITHM_NAMES is the Algorithm to algorithm name string.
+ALGORITHM_NAMES := [Algorithm]string {
+	.Invalid           = "Invalid",
+	.AES_GCM_128       = "AES-GCM-128",
+	.AES_GCM_192       = "AES-GCM-192",
+	.AES_GCM_256       = "AES-GCM-256",
+	.CHACHA20POLY1305  = "chacha20poly1305",
+	.XCHACHA20POLY1305 = "xchacha20poly1305",
+}
+
+// TAG_SIZES is the Algorithm to tag size in bytes.
+TAG_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.GCM_TAG_SIZE,
+	.AES_GCM_192       = aes.GCM_TAG_SIZE,
+	.AES_GCM_256       = aes.GCM_TAG_SIZE,
+	.CHACHA20POLY1305  = chacha20poly1305.TAG_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.TAG_SIZE,
+}
+
+// KEY_SIZES is the Algorithm to key size in bytes.
+KEY_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.KEY_SIZE_128,
+	.AES_GCM_192       = aes.KEY_SIZE_192,
+	.AES_GCM_256       = aes.KEY_SIZE_256,
+	.CHACHA20POLY1305  = chacha20poly1305.KEY_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.KEY_SIZE,
+}
+
+// IV_SIZES is the Algorithm to initialization vector size in bytes.
+//
+// Note: Some algorithms (such as AES-GCM) support variable IV sizes.
+IV_SIZES := [Algorithm]int {
+	.Invalid           = 0,
+	.AES_GCM_128       = aes.GCM_IV_SIZE,
+	.AES_GCM_192       = aes.GCM_IV_SIZE,
+	.AES_GCM_256       = aes.GCM_IV_SIZE,
+	.CHACHA20POLY1305  = chacha20poly1305.IV_SIZE,
+	.XCHACHA20POLY1305 = chacha20poly1305.XIV_SIZE,
+}
+
+// Context is a concrete instantiation of a specific AEAD algorithm.
+Context :: struct {
+	_algo: Algorithm,
+	_impl: union {
+		aes.Context_GCM,
+		chacha20poly1305.Context,
+	},
+}
+
+// _IMPL_IDS is the Algorithm to typeid of the Context._impl union variant
+// that backs it.  init uses this to select the variant in place.
+@(private)
+_IMPL_IDS := [Algorithm]typeid {
+	.Invalid           = nil,
+	.AES_GCM_128       = typeid_of(aes.Context_GCM),
+	.AES_GCM_192       = typeid_of(aes.Context_GCM),
+	.AES_GCM_256       = typeid_of(aes.Context_GCM),
+	.CHACHA20POLY1305  = typeid_of(chacha20poly1305.Context),
+	.XCHACHA20POLY1305 = typeid_of(chacha20poly1305.Context),
+}
+
+// init initializes a Context with a specific AEAD Algorithm.
+//
+// A Context that is already initialized is reset first.  This panics if
+// the key length does not match KEY_SIZES[algorithm], or if the algorithm
+// is Invalid.  If impl is non-nil, it must hold the Implementation
+// variant that matches the algorithm (aes.Implementation for AES-GCM,
+// chacha20.Implementation otherwise), as it is extracted with a type
+// assertion.  If impl is nil, the respective DEFAULT_IMPLEMENTATION is
+// used.
+init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementation = nil) {
+	if ctx._impl != nil {
+		reset(ctx)
+	}
+
+	if len(key) != KEY_SIZES[algorithm] {
+		panic("crypto/aead: invalid key size")
+	}
+
+	// Directly specialize the union by setting the type ID (save a copy).
+	reflect.set_union_variant_typeid(
+		ctx._impl,
+		_IMPL_IDS[algorithm],
+	)
+	// The variant must be selected before the type assertions below take
+	// a pointer to it.
+	switch algorithm {
+	case .AES_GCM_128, .AES_GCM_192, .AES_GCM_256:
+		impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION
+		aes.init_gcm(&ctx._impl.(aes.Context_GCM), key, impl_)
+	case .CHACHA20POLY1305:
+		impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION
+		chacha20poly1305.init(&ctx._impl.(chacha20poly1305.Context), key, impl_)
+	case .XCHACHA20POLY1305:
+		impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION
+		chacha20poly1305.init_xchacha(&ctx._impl.(chacha20poly1305.Context), key, impl_)
+	case .Invalid:
+		panic("crypto/aead: uninitialized algorithm")
+	case:
+		panic("crypto/aead: invalid algorithm")
+	}
+
+	ctx._algo = algorithm
+}
+
+// seal_ctx encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided Context and iv, stores the output in dst and tag.
+//
+// This panics if the Context has not been initialized.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	// Dispatch on the active variant of the implementation union.
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		aes.seal_gcm(&impl, dst, tag, iv, aad, plaintext)
+	case chacha20poly1305.Context:
+		chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext)
+	case:
+		panic("crypto/aead: uninitialized algorithm")
+	}
+}
+
+// open_ctx authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
+//
+// This panics if the Context has not been initialized.
+//
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	// Dispatch on the active variant of the implementation union.
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		return aes.open_gcm(&impl, dst, iv, aad, ciphertext, tag)
+	case chacha20poly1305.Context:
+		return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag)
+	case:
+		panic("crypto/aead: uninitialized algorithm")
+	}
+}
+
+// reset sanitizes the Context.  The Context must be re-initialized to
+// be used again.
+//
+// The active implementation (if any) is reset through its own reset
+// procedure, after which the Context is returned to the Invalid
+// algorithm with no implementation selected.
+reset :: proc(ctx: ^Context) {
+	switch &impl in ctx._impl {
+	case aes.Context_GCM:
+		aes.reset_gcm(&impl)
+	case chacha20poly1305.Context:
+		chacha20poly1305.reset(&impl)
+	case:
+		// Calling reset repeatedly is fine.
+	}
+
+	ctx._algo = .Invalid
+	ctx._impl = nil
+}
+
+// algorithm returns the Algorithm used by a Context instance.  This is
+// Invalid for a Context that has been reset.
+algorithm :: proc(ctx: ^Context) -> Algorithm {
+	return ctx._algo
+}
+
+// iv_size returns the IV size of a Context instance in bytes, as listed
+// in IV_SIZES.  This is 0 for a Context that has been reset.
+//
+// Note: Some algorithms (such as AES-GCM) support variable IV sizes.
+iv_size :: proc(ctx: ^Context) -> int {
+	return IV_SIZES[ctx._algo]
+}
+
+// tag_size returns the tag size of a Context instance in bytes, as listed
+// in TAG_SIZES.  This is 0 for a Context that has been reset.
+tag_size :: proc(ctx: ^Context) -> int {
+	return TAG_SIZES[ctx._algo]
+}
@@ -6,7 +6,6 @@ See:
 - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf
 - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
 */
-
 package aes

 import "core:crypto/_aes"
@@ -1,5 +1,6 @@
 package aes

+import "core:bytes"
 import "core:crypto/_aes/ct64"
 import "core:encoding/endian"
 import "core:math/bits"
@@ -19,7 +20,7 @@ Context_CTR :: struct {
 }

 // init_ctr initializes a Context_CTR with the provided key and IV.
-init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hardware) {
+init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	if len(iv) != CTR_IV_SIZE {
 		panic("crypto/aes: invalid CTR IV size")
 	}
@@ -37,15 +38,16 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hard
 xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {
 	assert(ctx._is_initialized)

-	// TODO: Enforcing that dst and src alias exactly or not at all
-	// is a good idea, though odd aliasing should be extremely uncommon.
-
 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}

-	for remaining := len(src); remaining > 0; {
+	if bytes.alias_inexactly(dst, src) {
+		panic("crypto/aes: dst and src alias inexactly")
+	}
+
+	#no_bounds_check for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == BLOCK_SIZE {
 			if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 {
@@ -83,7 +85,7 @@ keystream_bytes_ctr :: proc(ctx: ^Context_CTR, dst: []byte) {
 	assert(ctx._is_initialized)

 	dst := dst
-	for remaining := len(dst); remaining > 0; {
+	#no_bounds_check for remaining := len(dst); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == BLOCK_SIZE {
 			if nr_blocks := remaining / BLOCK_SIZE; nr_blocks > 0 {
@@ -123,8 +125,8 @@ reset_ctr :: proc "contextless" (ctx: ^Context_CTR) {
 	ctx._is_initialized = false
 }

-@(private)
-ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
+@(private = "file")
+ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
 	// Use the optimized hardware implementation if available.
 	if _, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		ctr_blocks_hw(ctx, dst, src, nr_blocks)
@@ -183,17 +185,17 @@ xor_blocks :: #force_inline proc "contextless" (dst, src: []byte, blocks: [][]by
 	// performance of this implementation matters to where that
 	// optimization would be worth it, use chacha20poly1305, or a
 	// CPU that isn't e-waste.
-	if src != nil {
-		#no_bounds_check {
-			for i in 0 ..< len(blocks) {
-				off := i * BLOCK_SIZE
-				for j in 0 ..< BLOCK_SIZE {
-					blocks[i][j] ~= src[off + j]
+	#no_bounds_check {
+		if src != nil {
+				for i in 0 ..< len(blocks) {
+					off := i * BLOCK_SIZE
+					for j in 0 ..< BLOCK_SIZE {
+						blocks[i][j] ~= src[off + j]
+					}
 				}
-			}
+		}
+		for i in 0 ..< len(blocks) {
+			copy(dst[i * BLOCK_SIZE:], blocks[i])
 		}
 	}
-	for i in 0 ..< len(blocks) {
-		copy(dst[i * BLOCK_SIZE:], blocks[i])
-	}
 }
@@ -0,0 +1,151 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:math/bits"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+CTR_STRIDE_HW :: 4
+@(private)
+CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
+
+// ctr_blocks_hw generates nr_blocks blocks of AES-CTR keystream with the
+// hardware AES instructions, XORs it with src (if src is non-nil), writes
+// the result to dst, and advances the counter stored in ctx.
+//
+// The counter is kept as two native u64s (ctr_hi, ctr_lo) and is
+// byteswapped into each counter block.  The local copies of the round
+// keys and of the keystream blocks are sanitized before returning.
+@(private, enable_target_feature = "sse2,aes")
+ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
+	// Take a pointer to the hardware context rather than copying it, so
+	// that no extra (unsanitized) copy of the expanded key is left on
+	// the stack.
+	hw_ctx := &ctx._impl.(Context_Impl_Hardware)
+
+	// Load the round keys once up front.
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= hw_ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
+	}
+
+	// hw_inc_ctr returns the counter block for the current (hi, lo)
+	// along with the counter incremented by one.
+	hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
+		ret := x86.__m128i{
+			i64(intrinsics.byte_swap(hi)),
+			i64(intrinsics.byte_swap(lo)),
+		}
+
+		hi, lo := hi, lo
+		carry: u64
+
+		lo, carry = bits.add_u64(lo, 1, 0)
+		hi, _ = bits.add_u64(hi, 0, carry)
+		return ret, hi, lo
+	}
+
+	// The latency of AESENC depends on mfg and microarchitecture:
+	// - 7 -> up to Broadwell
+	// - 4 -> AMD and Skylake - Cascade Lake
+	// - 3 -> Ice Lake and newer
+	//
+	// This implementation does 4 blocks at once, since performance
+	// should be "adequate" across most CPUs.
+
+	src, dst := src, dst
+	nr_blocks := nr_blocks
+	ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
+
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	for nr_blocks >= CTR_STRIDE_HW {
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		// Rounds 1 .. 9 are common to all key sizes.
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		xor_blocks_hw(dst, src, blks[:])
+
+		if src != nil {
+			src = src[CTR_STRIDE_BYTES_HW:]
+		}
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	for nr_blocks > 0 {
+		blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		xor_blocks_hw(dst, src, blks[:1])
+
+		if src != nil {
+			src = src[BLOCK_SIZE:]
+		}
+		dst = dst[BLOCK_SIZE:]
+		nr_blocks -= 1
+	}
+
+	// Write back the counter.
+	ctx._ctr_hi, ctx._ctr_lo = ctr_hi, ctr_lo
+
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+// xor_blocks_hw XORs each entry of blocks (in place) with the
+// corresponding BLOCK_SIZE bytes of src, if src is non-nil, and then
+// stores every entry of blocks to the corresponding BLOCK_SIZE bytes
+// of dst.
+//
+// No bounds checks are done; dst (and src, if provided) must cover at
+// least len(blocks) * BLOCK_SIZE bytes.
+@(private, enable_target_feature = "sse2")
+xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) #no_bounds_check {
+	nr_blocks := len(blocks)
+
+	if src != nil {
+		for idx in 0 ..< nr_blocks {
+			in_ptr := (^x86.__m128i)(raw_data(src[idx * BLOCK_SIZE:]))
+			blocks[idx] = x86._mm_xor_si128(blocks[idx], intrinsics.unaligned_load(in_ptr))
+		}
+	}
+
+	for idx in 0 ..< nr_blocks {
+		out_ptr := (^x86.__m128i)(raw_data(dst[idx * BLOCK_SIZE:]))
+		intrinsics.unaligned_store(out_ptr, blocks[idx])
+	}
+}
@@ -12,7 +12,7 @@ Context_ECB :: struct {
 }

 // init_ecb initializes a Context_ECB with the provided key.
-init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := Implementation.Hardware) {
+init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	init_impl(&ctx._impl, key, impl)
 	ctx._is_initialized = true
 }
@@ -0,0 +1,58 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd/x86"
+
+// encrypt_block_hw encrypts a single 16-byte block from src into dst
+// with the hardware AES instructions, using the expanded encryption key
+// in ctx.
+//
+// src and dst are accessed through raw pointers, so both must be at
+// least 16 bytes.  ctx._num_rounds is expected to be one of ROUNDS_128,
+// ROUNDS_192, or ROUNDS_256; no final round is applied for other values.
+@(private, enable_target_feature = "sse2,aes")
+encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	// Initial round key, followed by the rounds common to all key sizes.
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+	}
+	// Remaining rounds, which depend on the key size.
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}
+
+// decrypt_block_hw decrypts a single 16-byte block from src into dst
+// with the hardware AES instructions, using the expanded decryption key
+// in ctx.
+//
+// src and dst are accessed through raw pointers, so both must be at
+// least 16 bytes.  ctx._num_rounds is expected to be one of ROUNDS_128,
+// ROUNDS_192, or ROUNDS_256; no final round is applied for other values.
+@(private, enable_target_feature = "sse2,aes")
+decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	// Initial round key, followed by the rounds common to all key sizes.
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+	}
+	// Remaining rounds, which depend on the key size.
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}
@@ -1,13 +1,16 @@
 package aes

+import "core:bytes"
 import "core:crypto"
 import "core:crypto/_aes"
 import "core:crypto/_aes/ct64"
 import "core:encoding/endian"
 import "core:mem"

-// GCM_NONCE_SIZE is the size of the GCM nonce in bytes.
-GCM_NONCE_SIZE :: 12
+// GCM_IV_SIZE is the default size of the GCM IV in bytes.
+GCM_IV_SIZE :: 12
+// GCM_IV_SIZE_MAX is the maximum size of the GCM IV in bytes.
+GCM_IV_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits
 // GCM_TAG_SIZE is the size of a GCM tag in bytes.
 GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE

@@ -23,69 +26,79 @@ Context_GCM :: struct {
 }

 // init_gcm initializes a Context_GCM with the provided key.
-init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := Implementation.Hardware) {
+init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	init_impl(&ctx._impl, key, impl)
 	ctx._is_initialized = true
 }

 // seal_gcm encrypts the plaintext and authenticates the aad and ciphertext,
-// with the provided Context_GCM and nonce, stores the output in dst and tag.
+// with the provided Context_GCM and iv, stores the output in dst and tag.
 //
 // dst and plaintext MUST alias exactly or not at all.
-seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
+seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
 	assert(ctx._is_initialized)

-	gcm_validate_common_slice_sizes(tag, nonce, aad, plaintext)
+	gcm_validate_common_slice_sizes(tag, iv, aad, plaintext)
 	if len(dst) != len(plaintext) {
 		panic("crypto/aes: invalid destination ciphertext size")
 	}
+	if bytes.alias_inexactly(dst, plaintext) {
+		panic("crypto/aes: dst and plaintext alias inexactly")
+	}

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
-		gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext)
+		gcm_seal_hw(&impl, dst, tag, iv, aad, plaintext)
 		return
 	}

 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv)

 	// Note: Our GHASH implementation handles appending padding.
 	ct64.ghash(s[:], h[:], aad)
-	gctr_ct64(ctx, dst, &s, plaintext, &h, nonce, true)
-	final_ghash_ct64(&s, &h, &j0, len(aad), len(plaintext))
+	gctr_ct64(ctx, dst, &s, plaintext, &h, &j0, true)
+	final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(plaintext))
 	copy(tag, s[:])

 	mem.zero_explicit(&h, len(h))
 	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
 }

 // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
-// with the provided Context_GCM, nonce, and tag, and stores the output in dst,
+// with the provided Context_GCM, iv, and tag, and stores the output in dst,
 // returning true iff the authentication was successful.  If authentication
 // fails, the destination buffer will be zeroed.
 //
 // dst and plaintext MUST alias exactly or not at all.
-open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+@(require_results)
+open_gcm :: proc(ctx: ^Context_GCM, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	assert(ctx._is_initialized)

-	gcm_validate_common_slice_sizes(tag, nonce, aad, ciphertext)
+	gcm_validate_common_slice_sizes(tag, iv, aad, ciphertext)
 	if len(dst) != len(ciphertext) {
 		panic("crypto/aes: invalid destination plaintext size")
 	}
+	if bytes.alias_inexactly(dst, ciphertext) {
+		panic("crypto/aes: dst and ciphertext alias inexactly")
+	}

 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
-		return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag)
+		return gcm_open_hw(&impl, dst, iv, aad, ciphertext, tag)
 	}

 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, iv)

 	ct64.ghash(s[:], h[:], aad)
-	gctr_ct64(ctx, dst, &s, ciphertext, &h, nonce, false)
-	final_ghash_ct64(&s, &h, &j0, len(aad), len(ciphertext))
+	gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false)
+	final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(ciphertext))

 	ok := crypto.compare_constant_time(s[:], tag) == 1
 	if !ok {
@@ -94,32 +107,28 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->

 	mem.zero_explicit(&h, len(h))
 	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
 	mem.zero_explicit(&s, len(s))

 	return ok
 }

-// reset_ctr sanitizes the Context_GCM.  The Context_GCM must be
+// reset_gcm sanitizes the Context_GCM.  The Context_GCM must be
 // re-initialized to be used again.
 reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 	reset_impl(&ctx._impl)
 	ctx._is_initialized = false
 }

-@(private)
-gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) {
+@(private = "file")
+gcm_validate_common_slice_sizes :: proc(tag, iv, aad, text: []byte) {
 	if len(tag) != GCM_TAG_SIZE {
 		panic("crypto/aes: invalid GCM tag size")
 	}

-	// The specification supports nonces in the range [1, 2^64) bits
-	// however per NIST SP 800-38D 5.2.1.1:
-	//
-	// > For IVs, it is recommended that implementations restrict support
-	// > to the length of 96 bits, to promote interoperability, efficiency,
-	// > and simplicity of design.
-	if len(nonce) != GCM_NONCE_SIZE {
-		panic("crypto/aes: invalid GCM nonce size")
+	// The specification supports IVs in the range [1, 2^64) bits.
+	if l := len(iv); l == 0 || u64(l) >= GCM_IV_SIZE_MAX {
+		panic("crypto/aes: invalid GCM IV size")
 	}

 	if aad_len := u64(len(aad)); aad_len > GCM_A_MAX {
@@ -135,19 +144,33 @@ init_ghash_ct64 :: proc(
 	ctx: ^Context_GCM,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
 	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
-	nonce: []byte,
+	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	iv: []byte,
 ) {
 	impl := &ctx._impl.(ct64.Context)

 	// 1. Let H = CIPH(k, 0^128)
 	ct64.encrypt_block(impl, h[:], h[:])

+	// Define a block, J0, as follows:
+	if l := len(iv); l == GCM_IV_SIZE {
+		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
+		copy(j0[:], iv)
+		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
+	} else {
+		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
+		// and let J0 = GHASH_H(IV || 0^(s+64) || [len(IV)]_64).
+		ct64.ghash(j0[:], h[:], iv)
+
+		tmp: [_aes.GHASH_BLOCK_SIZE]byte
+		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
+		ct64.ghash(j0[:], h[:], tmp[:])
+	}
+
 	// ECB encrypt j0, so that we can just XOR with the tag.  In theory
 	// this could be processed along with the final GCTR block, to
 	// potentially save a call to AES-ECB, but... just use AES-NI.
-	copy(j0[:], nonce)
-	j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
-	ct64.encrypt_block(impl, j0[:], j0[:])
+	ct64.encrypt_block(impl, j0_enc[:], j0[:])
 }

@(private = "file")
@@ -175,33 +198,27 @@ gctr_ct64 :: proc(
 	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	src: []byte,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
-	nonce: []byte,
+	iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	is_seal: bool,
-) {
+) #no_bounds_check {
 	ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 {
 		endian.unchecked_put_u32be(dst[12:], ctr)
 		return ctr + 1
 	}

-	// 2. Define a block J_0 as follows:
-	//    if len(IV) = 96, then let J0 = IV || 0^31 || 1
-	//
-	// Note: We only support 96 bit IVs.
+	// Setup the counter blocks.
 	tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, ---
 	ctrs, blks: [ct64.STRIDE][]byte = ---, ---
-	ctr: u32 = 2
+	ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
 	for i in 0 ..< ct64.STRIDE {
 		// Setup scratch space for the keystream.
 		blks[i] = tmp2[i][:]

 		// Pre-copy the IV to all the counter blocks.
 		ctrs[i] = tmp[i][:]
-		copy(ctrs[i], nonce)
+		copy(ctrs[i], iv[:GCM_IV_SIZE])
 	}

-	// We stitch the GCTR and GHASH operations together, so that only
-	// one pass over the ciphertext is required.
-
 	impl := &ctx._impl.(ct64.Context)
 	src, dst := src, dst

@@ -0,0 +1,243 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/_aes"
+import "core:crypto/_aes/hw_intel"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd/x86"
+
+// gcm_seal_hw is the hardware accelerated AES-GCM seal operation.  It
+// encrypts plaintext into dst, and writes the authentication tag
+// computed over aad and the resulting ciphertext into tag.
+@(private)
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
+	// Per-call working state: the GHASH key (h), the pre-counter block
+	// (j0), its ECB encryption (j0_enc), and the GHASH accumulator (s).
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0, j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+
+	// Sanitize the key-dependent intermediates on the way out.  s is
+	// left alone, as it is handed back to the caller as the tag.
+	defer {
+		mem.zero_explicit(&h, len(h))
+		mem.zero_explicit(&j0, len(j0))
+		mem.zero_explicit(&j0_enc, len(j0_enc))
+	}
+
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
+
+	// Absorb the aad (the GHASH implementation handles appending
+	// padding), encrypt while hashing the ciphertext, then fold in the
+	// lengths and the encrypted pre-counter block.
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
+	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
+
+	copy(tag, s[:])
+}
+
+// gcm_open_hw is the hardware accelerated AES-GCM open operation.  It
+// decrypts ciphertext into dst while computing the tag over aad and
+// ciphertext, and returns true iff the computed tag matches tag.  On a
+// mismatch, dst is zeroed before returning false.
+@(private)
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	// Per-call working state: the GHASH key (h), the pre-counter block
+	// (j0), its ECB encryption (j0_enc), and the GHASH accumulator (s).
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0, j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+
+	// Sanitize every intermediate, including the expected tag, on all
+	// exit paths.
+	defer {
+		mem.zero_explicit(&h, len(h))
+		mem.zero_explicit(&j0, len(j0))
+		mem.zero_explicit(&j0_enc, len(j0_enc))
+		mem.zero_explicit(&s, len(s))
+	}
+
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, iv)
+
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
+	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
+
+	// dst was already written by gctr_hw, so it must be wiped if the
+	// tag does not verify.
+	if crypto.compare_constant_time(s[:], tag) != 1 {
+		mem.zero_explicit(raw_data(dst), len(dst))
+		return false
+	}
+
+	return true
+}
+
+// init_ghash_hw derives the per-message values needed by the GCM
+// routines in this file, using the hardware AES implementation.
+//
+// Inputs:
+// - ctx: The hardware AES context used for the block encryptions.
+// - h: Receives the GHASH key H.  It is encrypted in place, so it must
+//   be all-zero on entry for the result to be CIPH(k, 0^128).
+// - j0: Receives the pre-counter block J0.  The 96-bit IV path only
+//   writes the IV bytes and the final byte, so the remaining bytes are
+//   whatever the caller passed in (the callers in this file pass a
+//   zero-initialized block).
+// - j0_enc: Receives the ECB encryption of J0.
+// - iv: The IV.  A length of GCM_IV_SIZE bytes takes the direct path,
+//   any other length derives J0 via GHASH.
+@(private = "file")
+init_ghash_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	iv: []byte,
+) {
+	// 1. Let H = CIPH(k, 0^128)
+	encrypt_block_hw(ctx, h[:], h[:])
+
+	// Define a block, J0, as follows:
+	if l := len(iv); l == GCM_IV_SIZE {
+		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
+		copy(j0[:], iv)
+		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
+	} else {
+		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
+		// and let J0 = GHASH_H(IV || 0^(s+64) || [len(IV)]_64).
+		//
+		// This is done as two GHASH calls with j0 as the running state:
+		// first the IV itself (presumably padded to a block boundary by
+		// the GHASH implementation, which supplies the 0^s), and then ...
+		hw_intel.ghash(j0[:], h[:], iv)
+
+		// ... one block consisting of 0^64 || [len(IV)]_64, where the
+		// length is expressed in bits (hence the `* 8`).
+		tmp: [_aes.GHASH_BLOCK_SIZE]byte
+		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
+		hw_intel.ghash(j0[:], h[:], tmp[:])
+	}
+
+	// ECB encrypt j0, so that we can just XOR with the tag.
+	encrypt_block_hw(ctx, j0_enc[:], j0[:])
+}
+
+// final_ghash_hw finishes the tag computation in place in s.  It feeds
+// the length block ([len(A)]_64 || [len(C)]_64) into the GHASH state,
+// then XORs the result with j0.
+//
+// Inputs:
+// - s: The GHASH state after the aad and ciphertext have been absorbed,
+//   overwritten with the final tag.
+// - h: The GHASH key.
+// - j0: The block XORed into the GHASH output.  Despite the name, the
+//   callers in this file pass the encrypted pre-counter block (j0_enc).
+// - a_len: The aad length in bytes.
+// - t_len: The plaintext/ciphertext length in bytes.
+@(private = "file", enable_target_feature = "sse2")
+final_ghash_hw :: proc(
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	a_len: int,
+	t_len: int,
+) {
+	// Both lengths are encoded as 64-bit big-endian bit counts.
+	blk: [_aes.GHASH_BLOCK_SIZE]byte
+	endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
+	endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
+
+	hw_intel.ghash(s[:], h[:], blk[:])
+	// s ^= j0, done as a single 128-bit XOR using unaligned accesses.
+	j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
+	s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
+	s_vec = x86._mm_xor_si128(s_vec, j0_vec)
+	intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
+}
+
+// gctr_hw applies the AES counter mode keystream to src, writing the
+// result to dst, and absorbs the ciphertext into the GHASH state s as
+// it goes.
+//
+// Inputs:
+// - ctx: The hardware AES context providing the expanded round keys.
+// - dst: The output buffer.  It is indexed without bounds checks, and
+//   the callers are expected to have validated len(dst) == len(src).
+// - s: The running GHASH state, updated in place.
+// - src: The input (plaintext when sealing, ciphertext when opening).
+// - h: The GHASH key.
+// - iv: The pre-counter block J0 (not the raw IV).  The first
+//   GCM_IV_SIZE bytes are reused for every counter block, and the
+//   trailing 32-bit big-endian word supplies the initial counter.
+// - is_seal: true when encrypting (GHASH is fed dst after the XOR),
+//   false when decrypting (GHASH is fed src before the XOR).
+@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
+gctr_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	dst: []byte,
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	src: []byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	iv: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	is_seal: bool,
+) #no_bounds_check {
+	// Load the round keys into locals.  Only indices 0 ..= _num_rounds
+	// are initialized, the rest of the array is left as-is (`---`).
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
+	}
+
+	// Setup the counter block
+	//
+	// The counter value of J0 itself is not used for the keystream here,
+	// the first keystream block uses J0's counter + 1.
+	ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(iv))
+	ctr := endian.unchecked_get_u32be(iv[GCM_IV_SIZE:]) + 1
+
+	src, dst := src, dst
+
+	// Note: Instead of doing GHASH and CTR separately, it is more
+	// performant to interleave (stitch) the two operations together.
+	// This results in an unreadable mess, so we opt for simplicity
+	// as performance is adequate.
+
+	// Bulk path: process CTR_STRIDE_HW blocks per iteration.
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks >= CTR_STRIDE_HW {
+		// When decrypting, the ciphertext is hashed up front, as the
+		// XOR below overwrites it when dst aliases src.
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+		}
+
+		// Generate the counter blocks for this stride.
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+		}
+
+		// Initial round key addition, followed by rounds 1 ..= 9, which
+		// are executed regardless of the key size.
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		// The remaining rounds depend on the key size, with the final
+		// round always using aesenclast.
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		// blks now holds the keystream for this stride.
+		xor_blocks_hw(dst, src, blks[:])
+
+		// When encrypting, the ciphertext only exists after the XOR.
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+		}
+
+		src = src[CTR_STRIDE_BYTES_HW:]
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	//
+	// This processes one block at a time, and covers both the leftover
+	// full blocks and a trailing partial block (l < BLOCK_SIZE).
+	for n := len(src); n > 0; {
+		l := min(n, BLOCK_SIZE)
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:l])
+		}
+
+		blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+
+		// Same round structure as the bulk path, for a single block.
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		if l == BLOCK_SIZE {
+			xor_blocks_hw(dst, src, blks[:1])
+		} else {
+			// Partial block: stage the input in a zero-padded scratch
+			// block, XOR a full block's worth, and only copy the l
+			// valid bytes back out.
+			//
+			// NOTE(review): blk is not sanitized, and ends up holding
+			// the output bytes plus the unused tail of the keystream
+			// block.  Confirm if it should get a mem.zero_explicit.
+			blk: [BLOCK_SIZE]byte
+			copy(blk[:], src)
+			xor_blocks_hw(blk[:], blk[:], blks[:1])
+			copy(dst, blk[:l])
+		}
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:l])
+		}
+
+		dst = dst[l:]
+		src = src[l:]
+		n -= l
+	}
+
+	// Sanitize the keystream and the local copy of the round keys.
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+// hw_inc_ctr32 returns a copy of the counter block src^ with its last
+// 32-bit lane replaced by ctr in big-endian byte order, along with the
+// counter value to use for the next block (ctr + 1).  src^ itself is
+// not modified.
+//
+// BUG: Sticking this in gctr_hw (like the other implementations) crashes
+// the compiler.
+//
+// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
+@(private = "file", enable_target_feature = "sse4.1")
+hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
+	// The counter is stored big-endian in the block, so swap it before
+	// inserting it into lane 3.
+	ctr_be := i32(intrinsics.byte_swap(ctr))
+	return x86._mm_insert_epi32(src^, ctr_be, 3), ctr + 1
+}
@@ -10,6 +10,10 @@ Context_Impl :: union {
 	Context_Impl_Hardware,
 }

+// DEFAULT_IMPLEMENTATION is the implementation that will be used by
+// default if possible.
+DEFAULT_IMPLEMENTATION :: Implementation.Hardware
+
 // Implementation is an AES implementation.  Most callers will not need
 // to use this as the package will automatically select the most performant
 // implementation available (See `is_hardware_accelerated()`).
@@ -1,3 +1,4 @@
+//+build !amd64
 package aes

@(private = "file")
@@ -33,11 +34,11 @@ ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
 }

@(private)
-gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, iv, aad, plaintext: []byte) {
 	panic(ERR_HW_NOT_SUPPORTED)
 }

@(private)
-gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, iv, aad, ciphertext, tag: []byte) -> bool {
 	panic(ERR_HW_NOT_SUPPORTED)
 }
@@ -0,0 +1,18 @@
+//+build amd64
+package aes
+
+import "core:crypto/_aes/hw_intel"
+
+// is_hardware_accelerated returns true iff hardware accelerated AES
+// is supported.
+//
+// On this target (amd64), the answer is whatever
+// `hw_intel.is_supported()` reports.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return hw_intel.is_supported()
+}
+
+// Context_Impl_Hardware is the hardware accelerated AES context, which
+// on this target (amd64) is an alias for `hw_intel.Context`.
+@(private)
+Context_Impl_Hardware :: hw_intel.Context
+
+// init_impl_hw initializes the hardware accelerated AES context ctx
+// with the provided key, by forwarding to `hw_intel.init`.  The key
+// is not validated here.
+@(private, enable_target_feature = "sse2,aes")
+init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
+	hw_intel.init(ctx, key)
+}
@@ -7,134 +7,84 @@ See:
 */
 package chacha20

-import "core:encoding/endian"
-import "core:math/bits"
+import "core:bytes"
+import "core:crypto/_chacha20"
 import "core:mem"

 // KEY_SIZE is the (X)ChaCha20 key size in bytes.
-KEY_SIZE :: 32
-// NONCE_SIZE is the ChaCha20 nonce size in bytes.
-NONCE_SIZE :: 12
-// XNONCE_SIZE is the XChaCha20 nonce size in bytes.
-XNONCE_SIZE :: 24
-
-@(private)
-_MAX_CTR_IETF :: 0xffffffff
-
-@(private)
-_BLOCK_SIZE :: 64
-@(private)
-_STATE_SIZE_U32 :: 16
-@(private)
-_ROUNDS :: 20
-
-@(private)
-_SIGMA_0: u32 : 0x61707865
-@(private)
-_SIGMA_1: u32 : 0x3320646e
-@(private)
-_SIGMA_2: u32 : 0x79622d32
-@(private)
-_SIGMA_3: u32 : 0x6b206574
+KEY_SIZE :: _chacha20.KEY_SIZE
+// IV_SIZE is the ChaCha20 IV size in bytes.
+IV_SIZE :: _chacha20.IV_SIZE
+// XIV_SIZE is the XChaCha20 IV size in bytes.
+XIV_SIZE :: _chacha20.XIV_SIZE

 // Context is a ChaCha20 or XChaCha20 instance.
 Context :: struct {
-	_s:              [_STATE_SIZE_U32]u32,
-	_buffer:         [_BLOCK_SIZE]byte,
-	_off:            int,
-	_is_ietf_flavor: bool,
-	_is_initialized: bool,
+	_state: _chacha20.Context,
+	_impl:  Implementation,
 }

 // init initializes a Context for ChaCha20 or XChaCha20 with the provided
-// key and nonce.
-init :: proc(ctx: ^Context, key, nonce: []byte) {
+// key and iv.
+init :: proc(ctx: ^Context, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) {
 	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20: invalid ChaCha20 key size")
+		panic("crypto/chacha20: invalid (X)ChaCha20 key size")
 	}
-	if n_len := len(nonce); n_len != NONCE_SIZE && n_len != XNONCE_SIZE {
-		panic("crypto/chacha20: invalid (X)ChaCha20 nonce size")
+	if l := len(iv); l != IV_SIZE && l != XIV_SIZE {
+		panic("crypto/chacha20: invalid (X)ChaCha20 IV size")
 	}

-	k, n := key, nonce
+	k, n := key, iv

-	// Derive the XChaCha20 subkey and sub-nonce via HChaCha20.
-	is_xchacha := len(nonce) == XNONCE_SIZE
+	init_impl(ctx, impl)
+
+	is_xchacha := len(iv) == XIV_SIZE
 	if is_xchacha {
-		sub_key := ctx._buffer[:KEY_SIZE]
-		_hchacha20(sub_key, k, n)
+		sub_iv: [IV_SIZE]byte
+		sub_key := ctx._state._buffer[:KEY_SIZE]
+		hchacha20(sub_key, k, n, ctx._impl)
 		k = sub_key
-		n = n[16:24]
+		copy(sub_iv[4:], n[16:])
+		n = sub_iv[:]
 	}

-	ctx._s[0] = _SIGMA_0
-	ctx._s[1] = _SIGMA_1
-	ctx._s[2] = _SIGMA_2
-	ctx._s[3] = _SIGMA_3
-	ctx._s[4] = endian.unchecked_get_u32le(k[0:4])
-	ctx._s[5] = endian.unchecked_get_u32le(k[4:8])
-	ctx._s[6] = endian.unchecked_get_u32le(k[8:12])
-	ctx._s[7] = endian.unchecked_get_u32le(k[12:16])
-	ctx._s[8] = endian.unchecked_get_u32le(k[16:20])
-	ctx._s[9] = endian.unchecked_get_u32le(k[20:24])
-	ctx._s[10] = endian.unchecked_get_u32le(k[24:28])
-	ctx._s[11] = endian.unchecked_get_u32le(k[28:32])
-	ctx._s[12] = 0
-	if !is_xchacha {
-		ctx._s[13] = endian.unchecked_get_u32le(n[0:4])
-		ctx._s[14] = endian.unchecked_get_u32le(n[4:8])
-		ctx._s[15] = endian.unchecked_get_u32le(n[8:12])
-	} else {
-		ctx._s[13] = 0
-		ctx._s[14] = endian.unchecked_get_u32le(n[0:4])
-		ctx._s[15] = endian.unchecked_get_u32le(n[4:8])
+	_chacha20.init(&ctx._state, k, n, is_xchacha)

+	if is_xchacha {
 		// The sub-key is stored in the keystream buffer.  While
 		// this will be overwritten in most circumstances, explicitly
 		// clear it out early.
-		mem.zero_explicit(&ctx._buffer, KEY_SIZE)
+		mem.zero_explicit(&ctx._state._buffer, KEY_SIZE)
 	}
-
-	ctx._off = _BLOCK_SIZE
-	ctx._is_ietf_flavor = !is_xchacha
-	ctx._is_initialized = true
 }

 // seek seeks the (X)ChaCha20 stream counter to the specified block.
 seek :: proc(ctx: ^Context, block_nr: u64) {
-	assert(ctx._is_initialized)
-
-	if ctx._is_ietf_flavor {
-		if block_nr > _MAX_CTR_IETF {
-			panic("crypto/chacha20: attempted to seek past maximum counter")
-		}
-	} else {
-		ctx._s[13] = u32(block_nr >> 32)
-	}
-	ctx._s[12] = u32(block_nr)
-	ctx._off = _BLOCK_SIZE
+	_chacha20.seek(&ctx._state, block_nr)
 }

 // xor_bytes XORs each byte in src with bytes taken from the (X)ChaCha20
 // keystream, and writes the resulting output to dst.  Dst and src MUST
 // alias exactly or not at all.
 xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
-	// TODO: Enforcing that dst and src alias exactly or not at all
-	// is a good idea, though odd aliasing should be extremely uncommon.
+	assert(ctx._state._is_initialized)

 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}

-	for remaining := len(src); remaining > 0; {
+	if bytes.alias_inexactly(dst, src) {
+		panic("crypto/chacha20: dst and src alias inexactly")
+	}
+
+	st := &ctx._state
+	#no_bounds_check for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
-		if ctx._off == _BLOCK_SIZE {
-			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
-				direct_bytes := nr_blocks * _BLOCK_SIZE
-				_do_blocks(ctx, dst, src, nr_blocks)
+		if st._off == _chacha20.BLOCK_SIZE {
+			if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+				stream_blocks(ctx, dst, src, nr_blocks)
 				remaining -= direct_bytes
 				if remaining == 0 {
 					return
@@ -145,17 +95,17 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {

 			// If there is a partial block, generate and buffer 1 block
 			// worth of keystream.
-			_do_blocks(ctx, ctx._buffer[:], nil, 1)
-			ctx._off = 0
+			stream_blocks(ctx, st._buffer[:], nil, 1)
+			st._off = 0
 		}

 		// Process partial blocks from the buffered keystream.
-		to_xor := min(_BLOCK_SIZE - ctx._off, remaining)
-		buffered_keystream := ctx._buffer[ctx._off:]
+		to_xor := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+		buffered_keystream := st._buffer[st._off:]
 		for i := 0; i < to_xor; i = i + 1 {
 			dst[i] = buffered_keystream[i] ~ src[i]
 		}
-		ctx._off += to_xor
+		st._off += to_xor
 		dst = dst[to_xor:]
 		src = src[to_xor:]
 		remaining -= to_xor
@@ -164,15 +114,15 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {

 // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output.
 keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
-	assert(ctx._is_initialized)
+	assert(ctx._state._is_initialized)

-	dst := dst
-	for remaining := len(dst); remaining > 0; {
+	dst, st := dst, &ctx._state
+	#no_bounds_check for remaining := len(dst); remaining > 0; {
 		// Process multiple blocks at once
-		if ctx._off == _BLOCK_SIZE {
-			if nr_blocks := remaining / _BLOCK_SIZE; nr_blocks > 0 {
-				direct_bytes := nr_blocks * _BLOCK_SIZE
-				_do_blocks(ctx, dst, nil, nr_blocks)
+		if st._off == _chacha20.BLOCK_SIZE {
+			if nr_blocks := remaining / _chacha20.BLOCK_SIZE; nr_blocks > 0 {
+				direct_bytes := nr_blocks * _chacha20.BLOCK_SIZE
+				stream_blocks(ctx, dst, nil, nr_blocks)
 				remaining -= direct_bytes
 				if remaining == 0 {
 					return
@@ -182,15 +132,15 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {

 			// If there is a partial block, generate and buffer 1 block
 			// worth of keystream.
-			_do_blocks(ctx, ctx._buffer[:], nil, 1)
-			ctx._off = 0
+			stream_blocks(ctx, st._buffer[:], nil, 1)
+			st._off = 0
 		}

 		// Process partial blocks from the buffered keystream.
-		to_copy := min(_BLOCK_SIZE - ctx._off, remaining)
-		buffered_keystream := ctx._buffer[ctx._off:]
+		to_copy := min(_chacha20.BLOCK_SIZE - st._off, remaining)
+		buffered_keystream := st._buffer[st._off:]
 		copy(dst[:to_copy], buffered_keystream[:to_copy])
-		ctx._off += to_copy
+		st._off += to_copy
 		dst = dst[to_copy:]
 		remaining -= to_copy
 	}
@@ -199,366 +149,5 @@ keystream_bytes :: proc(ctx: ^Context, dst: []byte) {
 // reset sanitizes the Context.  The Context must be re-initialized to
 // be used again.
 reset :: proc(ctx: ^Context) {
-	mem.zero_explicit(&ctx._s, size_of(ctx._s))
-	mem.zero_explicit(&ctx._buffer, size_of(ctx._buffer))
-
-	ctx._is_initialized = false
-}
-
-@(private)
-_do_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
-	// Enforce the maximum consumed keystream per nonce.
-	//
-	// While all modern "standard" definitions of ChaCha20 use
-	// the IETF 32-bit counter, for XChaCha20 most common
-	// implementations allow for a 64-bit counter.
-	//
-	// Honestly, the answer here is "use a MRAE primitive", but
-	// go with common practice in the case of XChaCha20.
-	if ctx._is_ietf_flavor {
-		if u64(ctx._s[12]) + u64(nr_blocks) > 0xffffffff {
-			panic("crypto/chacha20: maximum ChaCha20 keystream per nonce reached")
-		}
-	} else {
-		ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12])
-		if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 {
-			panic("crypto/chacha20: maximum XChaCha20 keystream per nonce reached")
-		}
-	}
-
-	dst, src := dst, src
-	x := &ctx._s
-	for n := 0; n < nr_blocks; n = n + 1 {
-		x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
-		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
-
-		for i := _ROUNDS; i > 0; i = i - 2 {
-			// Even when forcing inlining manually inlining all of
-			// these is decently faster.
-
-			// quarterround(x, 0, 4, 8, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 16)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 12)
-			x0 += x4
-			x12 ~= x0
-			x12 = bits.rotate_left32(x12, 8)
-			x8 += x12
-			x4 ~= x8
-			x4 = bits.rotate_left32(x4, 7)
-
-			// quarterround(x, 1, 5, 9, 13)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 16)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 12)
-			x1 += x5
-			x13 ~= x1
-			x13 = bits.rotate_left32(x13, 8)
-			x9 += x13
-			x5 ~= x9
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 2, 6, 10, 14)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 16)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 12)
-			x2 += x6
-			x14 ~= x2
-			x14 = bits.rotate_left32(x14, 8)
-			x10 += x14
-			x6 ~= x10
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 3, 7, 11, 15)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 16)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 12)
-			x3 += x7
-			x15 ~= x3
-			x15 = bits.rotate_left32(x15, 8)
-			x11 += x15
-			x7 ~= x11
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 0, 5, 10, 15)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 16)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 12)
-			x0 += x5
-			x15 ~= x0
-			x15 = bits.rotate_left32(x15, 8)
-			x10 += x15
-			x5 ~= x10
-			x5 = bits.rotate_left32(x5, 7)
-
-			// quarterround(x, 1, 6, 11, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 16)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 12)
-			x1 += x6
-			x12 ~= x1
-			x12 = bits.rotate_left32(x12, 8)
-			x11 += x12
-			x6 ~= x11
-			x6 = bits.rotate_left32(x6, 7)
-
-			// quarterround(x, 2, 7, 8, 13)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 16)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 12)
-			x2 += x7
-			x13 ~= x2
-			x13 = bits.rotate_left32(x13, 8)
-			x8 += x13
-			x7 ~= x8
-			x7 = bits.rotate_left32(x7, 7)
-
-			// quarterround(x, 3, 4, 9, 14)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 16)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 12)
-			x3 += x4
-			x14 ~= x3
-			x14 = bits.rotate_left32(x14, 8)
-			x9 += x14
-			x4 ~= x9
-			x4 = bits.rotate_left32(x4, 7)
-		}
-
-		x0 += _SIGMA_0
-		x1 += _SIGMA_1
-		x2 += _SIGMA_2
-		x3 += _SIGMA_3
-		x4 += x[4]
-		x5 += x[5]
-		x6 += x[6]
-		x7 += x[7]
-		x8 += x[8]
-		x9 += x[9]
-		x10 += x[10]
-		x11 += x[11]
-		x12 += x[12]
-		x13 += x[13]
-		x14 += x[14]
-		x15 += x[15]
-
-		// While the "correct" answer to getting more performance out of
-		// this is "use vector operations", support for that is currently
-		// a work in progress/to be designed.
-		//
-		// In the meantime:
-		// - The caller(s) ensure that src/dst are valid.
-		// - The compiler knows if the target is picky about alignment.
-
-		#no_bounds_check {
-			if src != nil {
-				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
-				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
-				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
-				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
-				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
-				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
-				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
-				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
-				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
-				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
-				endian.unchecked_put_u32le(dst[40:44], endian.unchecked_get_u32le(src[40:44]) ~ x10)
-				endian.unchecked_put_u32le(dst[44:48], endian.unchecked_get_u32le(src[44:48]) ~ x11)
-				endian.unchecked_put_u32le(dst[48:52], endian.unchecked_get_u32le(src[48:52]) ~ x12)
-				endian.unchecked_put_u32le(dst[52:56], endian.unchecked_get_u32le(src[52:56]) ~ x13)
-				endian.unchecked_put_u32le(dst[56:60], endian.unchecked_get_u32le(src[56:60]) ~ x14)
-				endian.unchecked_put_u32le(dst[60:64], endian.unchecked_get_u32le(src[60:64]) ~ x15)
-				src = src[_BLOCK_SIZE:]
-			} else {
-				endian.unchecked_put_u32le(dst[0:4], x0)
-				endian.unchecked_put_u32le(dst[4:8], x1)
-				endian.unchecked_put_u32le(dst[8:12], x2)
-				endian.unchecked_put_u32le(dst[12:16], x3)
-				endian.unchecked_put_u32le(dst[16:20], x4)
-				endian.unchecked_put_u32le(dst[20:24], x5)
-				endian.unchecked_put_u32le(dst[24:28], x6)
-				endian.unchecked_put_u32le(dst[28:32], x7)
-				endian.unchecked_put_u32le(dst[32:36], x8)
-				endian.unchecked_put_u32le(dst[36:40], x9)
-				endian.unchecked_put_u32le(dst[40:44], x10)
-				endian.unchecked_put_u32le(dst[44:48], x11)
-				endian.unchecked_put_u32le(dst[48:52], x12)
-				endian.unchecked_put_u32le(dst[52:56], x13)
-				endian.unchecked_put_u32le(dst[56:60], x14)
-				endian.unchecked_put_u32le(dst[60:64], x15)
-			}
-			dst = dst[_BLOCK_SIZE:]
-		}
-
-		// Increment the counter.  Overflow checking is done upon
-		// entry into the routine, so a 64-bit increment safely
-		// covers both cases.
-		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
-		x[12] = u32(new_ctr)
-		x[13] = u32(new_ctr >> 32)
-	}
-}
-
-@(private)
-_hchacha20 :: proc "contextless" (dst, key, nonce: []byte) {
-	x0, x1, x2, x3 := _SIGMA_0, _SIGMA_1, _SIGMA_2, _SIGMA_3
-	x4 := endian.unchecked_get_u32le(key[0:4])
-	x5 := endian.unchecked_get_u32le(key[4:8])
-	x6 := endian.unchecked_get_u32le(key[8:12])
-	x7 := endian.unchecked_get_u32le(key[12:16])
-	x8 := endian.unchecked_get_u32le(key[16:20])
-	x9 := endian.unchecked_get_u32le(key[20:24])
-	x10 := endian.unchecked_get_u32le(key[24:28])
-	x11 := endian.unchecked_get_u32le(key[28:32])
-	x12 := endian.unchecked_get_u32le(nonce[0:4])
-	x13 := endian.unchecked_get_u32le(nonce[4:8])
-	x14 := endian.unchecked_get_u32le(nonce[8:12])
-	x15 := endian.unchecked_get_u32le(nonce[12:16])
-
-	for i := _ROUNDS; i > 0; i = i - 2 {
-		// quarterround(x, 0, 4, 8, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 16)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 12)
-		x0 += x4
-		x12 ~= x0
-		x12 = bits.rotate_left32(x12, 8)
-		x8 += x12
-		x4 ~= x8
-		x4 = bits.rotate_left32(x4, 7)
-
-		// quarterround(x, 1, 5, 9, 13)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 16)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 12)
-		x1 += x5
-		x13 ~= x1
-		x13 = bits.rotate_left32(x13, 8)
-		x9 += x13
-		x5 ~= x9
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 2, 6, 10, 14)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 16)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 12)
-		x2 += x6
-		x14 ~= x2
-		x14 = bits.rotate_left32(x14, 8)
-		x10 += x14
-		x6 ~= x10
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 3, 7, 11, 15)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 16)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 12)
-		x3 += x7
-		x15 ~= x3
-		x15 = bits.rotate_left32(x15, 8)
-		x11 += x15
-		x7 ~= x11
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 0, 5, 10, 15)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 16)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 12)
-		x0 += x5
-		x15 ~= x0
-		x15 = bits.rotate_left32(x15, 8)
-		x10 += x15
-		x5 ~= x10
-		x5 = bits.rotate_left32(x5, 7)
-
-		// quarterround(x, 1, 6, 11, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 16)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 12)
-		x1 += x6
-		x12 ~= x1
-		x12 = bits.rotate_left32(x12, 8)
-		x11 += x12
-		x6 ~= x11
-		x6 = bits.rotate_left32(x6, 7)
-
-		// quarterround(x, 2, 7, 8, 13)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 16)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 12)
-		x2 += x7
-		x13 ~= x2
-		x13 = bits.rotate_left32(x13, 8)
-		x8 += x13
-		x7 ~= x8
-		x7 = bits.rotate_left32(x7, 7)
-
-		// quarterround(x, 3, 4, 9, 14)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 16)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 12)
-		x3 += x4
-		x14 ~= x3
-		x14 = bits.rotate_left32(x14, 8)
-		x9 += x14
-		x4 ~= x9
-		x4 = bits.rotate_left32(x4, 7)
-	}
-
-	endian.unchecked_put_u32le(dst[0:4], x0)
-	endian.unchecked_put_u32le(dst[4:8], x1)
-	endian.unchecked_put_u32le(dst[8:12], x2)
-	endian.unchecked_put_u32le(dst[12:16], x3)
-	endian.unchecked_put_u32le(dst[16:20], x12)
-	endian.unchecked_put_u32le(dst[20:24], x13)
-	endian.unchecked_put_u32le(dst[24:28], x14)
-	endian.unchecked_put_u32le(dst[28:32], x15)
+	_chacha20.reset(&ctx._state)
 }
@@ -0,0 +1,56 @@
+package chacha20
+
+import "base:intrinsics"
+import "core:crypto/_chacha20/ref"
+import "core:crypto/_chacha20/simd128"
+import "core:crypto/_chacha20/simd256"
+
+// DEFAULT_IMPLEMENTATION is the implementation that will be used by
+// default if possible.
+DEFAULT_IMPLEMENTATION :: Implementation.Simd256
+
+// Implementation is a ChaCha20 implementation.  Most callers will not need
+// to use this as the package will automatically select the most performant
+// implementation available.
+Implementation :: enum {
+	// Portable dispatches to the `_chacha20/ref` package.
+	Portable,
+	// Simd128 dispatches to the `_chacha20/simd128` package.
+	Simd128,
+	// Simd256 dispatches to the `_chacha20/simd256` package.
+	Simd256,
+}
+
+// init_impl records the implementation that ctx will use.  It starts
+// from the requested impl, and steps down to the next fallback
+// (Simd256 -> Simd128 -> Portable) whenever the candidate's
+// `is_performant()` check fails.
+//
+// Inputs:
+// - ctx: The Context to update, only `ctx._impl` is written.
+// - impl: The requested implementation.
+@(private)
+init_impl :: proc(ctx: ^Context, impl: Implementation) {
+	impl := impl
+	if impl == .Simd256 && !simd256.is_performant() {
+		impl = .Simd128
+	}
+	// This is deliberately not an `else`, so that a request for Simd256
+	// can fall all the way through to Portable.
+	if impl == .Simd128 && !simd128.is_performant() {
+		impl = .Portable
+	}
+
+	ctx._impl = impl
+}
+
+// stream_blocks forwards a request for nr_blocks blocks to the backend
+// recorded in ctx._impl, passing dst and src through unchanged.  The
+// callers in this package pass a nil src to obtain raw keystream.
+@(private)
+stream_blocks :: proc(ctx: ^Context, dst, src: []byte, nr_blocks: int) {
+	st := &ctx._state
+	switch ctx._impl {
+	case .Portable:
+		ref.stream_blocks(st, dst, src, nr_blocks)
+	case .Simd128:
+		simd128.stream_blocks(st, dst, src, nr_blocks)
+	case .Simd256:
+		simd256.stream_blocks(st, dst, src, nr_blocks)
+	}
+}
+
+// hchacha20 forwards to the HChaCha20 routine of the backend selected
+// by impl, passing dst, key, and iv through unchanged.  Slice sizes
+// are not validated here.
+@(private)
+hchacha20 :: proc "contextless" (dst, key, iv: []byte, impl: Implementation) {
+	switch impl {
+	case .Portable:
+		ref.hchacha20(dst, key, iv)
+	case .Simd128:
+		simd128.hchacha20(dst, key, iv)
+	case .Simd256:
+		simd256.hchacha20(dst, key, iv)
+	}
+}
@@ -1,9 +1,11 @@
 /*
-package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 Authenticated
-Encryption with Additional Data algorithm.
+package chacha20poly1305 implements the AEAD_CHACHA20_POLY1305 and
+AEAD_XChaCha20_Poly1305 Authenticated Encryption with Additional Data
+algorithms.

 See:
 - https://www.rfc-editor.org/rfc/rfc8439
+- https://datatracker.ietf.org/doc/html/draft-arciszewski-xchacha-03
 */
 package chacha20poly1305

@@ -15,8 +17,10 @@ import "core:mem"

 // KEY_SIZE is the chacha20poly1305 key size in bytes.
 KEY_SIZE :: chacha20.KEY_SIZE
-// NONCE_SIZE is the chacha20poly1305 nonce size in bytes.
-NONCE_SIZE :: chacha20.NONCE_SIZE
+// IV_SIZE is the chacha20poly1305 IV size in bytes.
+IV_SIZE :: chacha20.IV_SIZE
+// XIV_SIZE is the xchacha20poly1305 IV size in bytes.
+XIV_SIZE :: chacha20.XIV_SIZE
 // TAG_SIZE is the chacha20poly1305 tag size in bytes.
 TAG_SIZE :: poly1305.TAG_SIZE

@@ -24,15 +28,13 @@ TAG_SIZE :: poly1305.TAG_SIZE
 _P_MAX :: 64 * 0xffffffff // 64 * (2^32-1)

@(private)
-_validate_common_slice_sizes :: proc (tag, key, nonce, aad, text: []byte) {
+_validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bool) {
 	if len(tag) != TAG_SIZE {
 		panic("crypto/chacha20poly1305: invalid destination tag size")
 	}
-	if len(key) != KEY_SIZE {
-		panic("crypto/chacha20poly1305: invalid key size")
-	}
-	if len(nonce) != NONCE_SIZE {
-		panic("crypto/chacha20poly1305: invalid nonce size")
+	expected_iv_len := is_xchacha ? XIV_SIZE : IV_SIZE
+	if len(iv) != expected_iv_len {
+		panic("crypto/chacha20poly1305: invalid IV size")
 	}

 	#assert(size_of(int) == 8 || size_of(int) <= 4)
@@ -59,18 +61,52 @@ _update_mac_pad16 :: #force_inline proc (ctx: ^poly1305.Context, x_len: int) {
 	}
 }

-// encrypt encrypts the plaintext and authenticates the aad and ciphertext,
-// with the provided key and nonce, stores the output in ciphertext and tag.
-encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
-	_validate_common_slice_sizes(tag, key, nonce, aad, plaintext)
+// Context is a keyed (X)Chacha20Poly1305 instance.
+Context :: struct {
+	// Copy of the key supplied to `init`/`init_xchacha`; zeroed by `reset`.
+	_key:            [KEY_SIZE]byte,
+	// ChaCha20 implementation passed on to `chacha20.init` by `seal`/`open`.
+	_impl:           chacha20.Implementation,
+	// Selects the expected IV length: XIV_SIZE when true, IV_SIZE otherwise.
+	_is_xchacha:     bool,
+	// Set by `init`, cleared by `reset`.
+	_is_initialized: bool,
+}
+
+// init keys a Context for AEAD_CHACHA20_POLY1305 with the provided key
+// and ChaCha20 implementation.  It panics unless the key is exactly
+// KEY_SIZE bytes long.
+init :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) {
+	key_len_ok := len(key) == KEY_SIZE
+	if !key_len_ok {
+		panic("crypto/chacha20poly1305: invalid key size")
+	}
+
+	ctx._impl = impl
+	ctx._is_xchacha = false
+	copy(ctx._key[:], key)
+	ctx._is_initialized = true
+}
+
+// init_xchacha initializes a Context with the provided key, for
+// AEAD_XChaCha20_Poly1305.
+//
+// Note: While there are multiple definitions of XChaCha20-Poly1305
+// this sticks to the IETF draft and uses a 32-bit counter.
+init_xchacha :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) {
+	// `init` validates and stores the key but leaves `_is_xchacha` false,
+	// so the flag has to be raised after it returns.
+	init(ctx, key, impl)
+	ctx._is_xchacha = true
+}
+
+// seal encrypts the plaintext and authenticates the aad and ciphertext,
+// with the provided Context and iv, stores the output in dst and tag.
+//
+// dst and plaintext MUST alias exactly or not at all.
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) {
+	ciphertext := dst
+	_validate_common_slice_sizes(tag, iv, aad, plaintext, ctx._is_xchacha)
 	if len(ciphertext) != len(plaintext) {
 		panic("crypto/chacha20poly1305: invalid destination ciphertext size")
 	}

 	stream_ctx: chacha20.Context = ---
-	chacha20.init(&stream_ctx, key, nonce)
+	chacha20.init(&stream_ctx, ctx._key[:],iv, ctx._impl)
+	stream_ctx._state._is_ietf_flavor = true

-	// otk = poly1305_key_gen(key, nonce)
+	// otk = poly1305_key_gen(key, iv)
 	otk: [poly1305.KEY_SIZE]byte = ---
 	chacha20.keystream_bytes(&stream_ctx, otk[:])
 	mac_ctx: poly1305.Context = ---
@@ -87,7 +123,7 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	poly1305.update(&mac_ctx, aad)
 	_update_mac_pad16(&mac_ctx, aad_len)

-	// ciphertext = chacha20_encrypt(key, 1, nonce, plaintext)
+	// ciphertext = chacha20_encrypt(key, 1, iv, plaintext)
 	chacha20.seek(&stream_ctx, 1)
 	chacha20.xor_bytes(&stream_ctx, ciphertext, plaintext)
 	chacha20.reset(&stream_ctx) // Don't need the stream context anymore.
@@ -107,13 +143,16 @@ encrypt :: proc (ciphertext, tag, key, nonce, aad, plaintext: []byte) {
 	poly1305.final(&mac_ctx, tag) // Implicitly sanitizes context.
 }

-// decrypt authenticates the aad and ciphertext, and decrypts the ciphertext,
-// with the provided key, nonce, and tag, and stores the output in plaintext,
-// returning true iff the authentication was successful.
+// open authenticates the aad and ciphertext, and decrypts the ciphertext,
+// with the provided Context, iv, and tag, and stores the output in dst,
+// returning true iff the authentication was successful.  If authentication
+// fails, the destination buffer will be zeroed.
 //
-// If authentication fails, the destination plaintext buffer will be zeroed.
-decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
-	_validate_common_slice_sizes(tag, key, nonce, aad, ciphertext)
+// dst and ciphertext MUST alias exactly or not at all.
+@(require_results)
+open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool {
+	plaintext := dst
+	_validate_common_slice_sizes(tag, iv, aad, ciphertext, ctx._is_xchacha)
 	if len(ciphertext) != len(plaintext) {
 		panic("crypto/chacha20poly1305: invalid destination plaintext size")
 	}
@@ -123,9 +162,10 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
 	// points where needed.

 	stream_ctx: chacha20.Context = ---
-	chacha20.init(&stream_ctx, key, nonce)
+	chacha20.init(&stream_ctx, ctx._key[:], iv, ctx._impl)
+	stream_ctx._state._is_ietf_flavor = true

-	// otk = poly1305_key_gen(key, nonce)
+	// otk = poly1305_key_gen(key, iv)
 	otk: [poly1305.KEY_SIZE]byte = ---
 	chacha20.keystream_bytes(&stream_ctx, otk[:])
 	defer chacha20.reset(&stream_ctx)
@@ -160,9 +200,17 @@ decrypt :: proc (plaintext, tag, key, nonce, aad, ciphertext: []byte) -> bool {
 		return false
 	}

-	// plaintext = chacha20_decrypt(key, 1, nonce, ciphertext)
+	// plaintext = chacha20_decrypt(key, 1, iv, ciphertext)
 	chacha20.seek(&stream_ctx, 1)
 	chacha20.xor_bytes(&stream_ctx, plaintext, ciphertext)

 	return true
 }
+
+// reset sanitizes the Context: the stored key is wiped with
+// `mem.zero_explicit` and the XChaCha/initialized flags are cleared
+// (`_impl` is left as-is).  The Context must be re-initialized before
+// it is used again.
+reset :: proc "contextless" (ctx: ^Context) {
+	ctx._is_initialized = false
+	ctx._is_xchacha = false
+	mem.zero_explicit(&ctx._key, len(ctx._key))
+}
@@ -60,7 +60,11 @@ rand_bytes :: proc (dst: []byte) {
 	_rand_bytes(dst)
 }

-
+// random_generator returns a `runtime.Random_Generator` backed by the
+// system entropy source.
+//
+// Support for the system entropy source can be checked with the
+// `HAS_RAND_BYTES` boolean constant.
 random_generator :: proc() -> runtime.Random_Generator {
 	return {
 		procedure = proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
@@ -79,4 +83,4 @@ random_generator :: proc() -> runtime.Random_Generator {
 		},
 		data = nil,
 	}
-}
+}
@@ -21,7 +21,7 @@ PUBLIC_KEY_SIZE :: 32
 SIGNATURE_SIZE :: 64

@(private)
-NONCE_SIZE :: 32
+HDIGEST2_SIZE :: 32

 // Private_Key is an Ed25519 private key.
 Private_Key :: struct {
@@ -33,7 +33,7 @@ Private_Key :: struct {
 	// See: https://github.com/MystenLabs/ed25519-unsafe-libs
 	_b:              [PRIVATE_KEY_SIZE]byte,
 	_s:              grp.Scalar,
-	_nonce:          [NONCE_SIZE]byte,
+	_hdigest2:       [HDIGEST2_SIZE]byte,
 	_pub_key:        Public_Key,
 	_is_initialized: bool,
 }
@@ -63,7 +63,7 @@ private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool {
 	sha2.final(&ctx, h_bytes[:])

 	copy(priv_key._b[:], b)
-	copy(priv_key._nonce[:], h_bytes[32:])
+	copy(priv_key._hdigest2[:], h_bytes[32:])
 	grp.sc_set_bytes_rfc8032(&priv_key._s, h_bytes[:32])

 	// Derive the corresponding public key.
@@ -116,7 +116,7 @@ sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) {
 	ctx: sha2.Context_512 = ---
 	digest_bytes: [sha2.DIGEST_SIZE_512]byte = ---
 	sha2.init_512(&ctx)
-	sha2.update(&ctx, priv_key._nonce[:])
+	sha2.update(&ctx, priv_key._hdigest2[:])
 	sha2.update(&ctx, msg)
 	sha2.final(&ctx, digest_bytes[:])

@@ -28,20 +28,26 @@ hash_bytes :: proc(algorithm: Algorithm, data: []byte, allocator := context.allo

 // hash_string_to_buffer will hash the given input and assign the
 // computed digest to the third parameter.  It requires that the
-// destination buffer is at least as big as the digest size.
-hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) {
-	hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash)
+// destination buffer is at least as big as the digest size.  The
+// provided destination buffer is returned to match the behavior of
+// `hash_string`.
+hash_string_to_buffer :: proc(algorithm: Algorithm, data: string, hash: []byte) -> []byte {
+	return hash_bytes_to_buffer(algorithm, transmute([]byte)(data), hash)
 }

 // hash_bytes_to_buffer will hash the given input and write the
 // computed digest into the third parameter.  It requires that the
-// destination buffer is at least as big as the digest size.
-hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) {
+// destination buffer is at least as big as the digest size.  The
+// provided destination buffer is returned to match the behavior of
+// `hash_bytes`.
+hash_bytes_to_buffer :: proc(algorithm: Algorithm, data, hash: []byte) -> []byte {
 	ctx: Context

 	init(&ctx, algorithm)
 	update(&ctx, data)
 	final(&ctx, hash)
+
+	return hash
 }

 // hash_stream will incrementally fully consume a stream, and return the
@@ -8,9 +8,9 @@ HAS_RAND_BYTES :: true

@(private)
 _rand_bytes :: proc(dst: []byte) {
-	ret := (os.Errno)(win32.BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), win32.BCRYPT_USE_SYSTEM_PREFERRED_RNG))
-	if ret != os.ERROR_NONE {
-		switch ret {
+	ret := os.Platform_Error(win32.BCryptGenRandom(nil, raw_data(dst), u32(len(dst)), win32.BCRYPT_USE_SYSTEM_PREFERRED_RNG))
+	if ret != nil {
+		#partial switch ret {
 		case os.ERROR_INVALID_HANDLE:
 			// The handle to the first parameter is invalid.
 			// This should not happen here, since we explicitly pass nil to it
@@ -16,15 +16,12 @@ Library :: distinct rawptr
 Loads a dynamic library from the filesystem. The parameter `global_symbols` makes the symbols in the loaded
 library available to resolve references in subsequently loaded libraries.

-The paramater `global_symbols` is only used for the platforms `linux`, `darwin`, `freebsd` and `openbsd`.
+The parameter `global_symbols` is only used for the platforms `linux`, `darwin`, `freebsd` and `openbsd`.
 On `windows` this parameter is ignored.

 The underlying behaviour is platform specific.
 On `linux`, `darwin`, `freebsd` and `openbsd` refer to `dlopen`.
-On `windows` refer to `LoadLibraryW`.
-
-**Implicit Allocators**
-`context.temp_allocator`
+On `windows` refer to `LoadLibraryW`. Also temporarily needs an allocator to convert a string.

 Example:
 	import "core:dynlib"
@@ -79,10 +76,7 @@ Loads the address of a procedure/variable from a dynamic library.

 The underlying behaviour is platform specific.
 On `linux`, `darwin`, `freebsd` and `openbsd` refer to `dlsym`.
-On `windows` refer to `GetProcAddress`.
-
-**Implicit Allocators**
-`context.temp_allocator`
+On `windows` refer to `GetProcAddress`. Also temporarily needs an allocator to convert a string.

 Example:
 	import "core:dynlib"
@@ -177,9 +171,7 @@ initialize_symbols :: proc(
 	return count, count > 0
 }

-/*
-Returns an error message for the last failed procedure call.
-*/
+// Returns an error message for the last failed procedure call.
 last_error :: proc() -> string {
 	return _last_error()
-}
+}
@@ -16,4 +16,4 @@ _symbol_address :: proc(library: Library, symbol: string) -> (ptr: rawptr, found

 _last_error :: proc() -> string {
 	return ""
-}
+}
@@ -26,4 +26,4 @@ _symbol_address :: proc(library: Library, symbol: string) -> (ptr: rawptr, found
 _last_error :: proc() -> string {
 	err := os.dlerror()
 	return "unknown" if err == "" else err
-}
+}
@@ -4,14 +4,12 @@ package dynlib

 import win32 "core:sys/windows"
 import "core:strings"
-import "base:runtime"
 import "core:reflect"

-_load_library :: proc(path: string, global_symbols := false) -> (Library, bool) {
+_load_library :: proc(path: string, global_symbols := false, allocator := context.temp_allocator) -> (Library, bool) {
 	// NOTE(bill): 'global_symbols' is here only for consistency with POSIX which has RTLD_GLOBAL
-
-	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
-	wide_path := win32.utf8_to_wstring(path, context.temp_allocator)
+	wide_path := win32.utf8_to_wstring(path, allocator)
+	defer free(wide_path, allocator)
 	handle := cast(Library)win32.LoadLibraryW(wide_path)
 	return handle, handle != nil
 }
@@ -21,9 +19,9 @@ _unload_library :: proc(library: Library) -> bool {
 	return bool(ok)
 }

-_symbol_address :: proc(library: Library, symbol: string) -> (ptr: rawptr, found: bool) {
-	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
-	c_str := strings.clone_to_cstring(symbol, context.temp_allocator)
+_symbol_address :: proc(library: Library, symbol: string, allocator := context.temp_allocator) -> (ptr: rawptr, found: bool) {
+	c_str := strings.clone_to_cstring(symbol, allocator)
+	defer delete(c_str, allocator)
 	ptr = win32.GetProcAddress(cast(win32.HMODULE)library, c_str)
 	found = ptr != nil
 	return
@@ -33,4 +31,4 @@ _last_error :: proc() -> string {
 	err := win32.System_Error(win32.GetLastError())
 	err_msg := reflect.enum_string(err)
 	return "unknown" if err_msg == "" else err_msg
-}
+}
@@ -3,6 +3,7 @@ package encoding_cbor
 import "base:intrinsics"

 import "core:encoding/json"
+import "core:encoding/hex"
 import "core:io"
 import "core:mem"
 import "core:strconv"
@@ -399,11 +400,11 @@ to_diagnostic_format_writer :: proc(w: io.Writer, val: Value, padding := 0) -> i
 		io.write_string(w, str) or_return

 	case bool: io.write_string(w, "true" if v else "false") or_return
-	case Nil: io.write_string(w, "nil") or_return
+	case Nil: io.write_string(w, "null") or_return
 	case Undefined: io.write_string(w, "undefined") or_return
 	case ^Bytes:
 		io.write_string(w, "h'") or_return
-		for b in v { io.write_int(w, int(b), 16) or_return }
+		hex.encode_into_writer(w, v^) or_return
 		io.write_string(w, "'") or_return
 	case ^Text:
 		io.write_string(w, `"`) or_return
@@ -77,8 +77,11 @@ You can look at the default tags provided for pointers on how these implementati
 Example:
 	package main

+	import "base:intrinsics"
+
 	import "core:encoding/cbor"
 	import "core:fmt"
+	import "core:reflect"
 	import "core:time"

 	Possibilities :: union {
@@ -93,9 +96,32 @@ Example:
 		ignore_this: ^Data `cbor:"-"`,     // Ignored by implementation.
 		renamed: f32 `cbor:"renamed :)"`,  // Renamed when encoded.
 		my_union: Possibilities,           // Union support.
+
+		my_raw: [8]u32 `cbor_tag:"raw"`, // Custom tag that just writes the value as bytes.
 	}

 	main :: proc() {
+		// Example custom tag implementation that instead of breaking down all parts,
+		// just writes the value as a big byte blob. This is an advanced feature but very powerful.
+		RAW_TAG_NR :: 200
+		cbor.tag_register_number({
+			marshal = proc(_: ^cbor.Tag_Implementation, e: cbor.Encoder, v: any) -> cbor.Marshal_Error {
+				cbor._encode_u8(e.writer, RAW_TAG_NR, .Tag) or_return
+				return cbor.err_conv(cbor._encode_bytes(e, reflect.as_bytes(v)))
+			},
+			unmarshal = proc(_: ^cbor.Tag_Implementation, d: cbor.Decoder, _: cbor.Tag_Number, v: any) -> (cbor.Unmarshal_Error) {
+				hdr := cbor._decode_header(d.reader) or_return
+				maj, add := cbor._header_split(hdr)
+				if maj != .Bytes {
+					return .Bad_Tag_Value
+				}
+
+				bytes := cbor.err_conv(cbor._decode_bytes(d, add, maj)) or_return
+				intrinsics.mem_copy_non_overlapping(v.data, raw_data(bytes), len(bytes))
+				return nil
+			},
+		}, RAW_TAG_NR, "raw")
+
 		now := time.Time{_nsec = 1701117968 * 1e9}

 		data := Data{
@@ -105,21 +131,22 @@ Example:
 			ignore_this = &Data{},
 			renamed     = 123123.125,
 			my_union    = 3,
+			my_raw      = {1=1, 2=2, 3=3},
 		}
-		
+
 		// Marshal the struct into binary CBOR.
 		binary, err := cbor.marshal(data, cbor.ENCODE_FULLY_DETERMINISTIC)
-		assert(err == nil)
+		fmt.assertf(err == nil, "marshal error: %v", err)
 		defer delete(binary)
-		
+
 		// Decode the binary data into a `cbor.Value`.
 		decoded, derr := cbor.decode(string(binary))
-		assert(derr == nil)
+		fmt.assertf(derr == nil, "decode error: %v", derr)
 		defer cbor.destroy(decoded)

 		// Turn the CBOR into a human readable representation defined as the diagnostic format in [[RFC 8949 Section 8;https://www.rfc-editor.org/rfc/rfc8949.html#name-diagnostic-notation]].
 		diagnosis, eerr := cbor.to_diagnostic_format(decoded)
-		assert(eerr == nil)
+		fmt.assertf(eerr == nil, "to diagnostic error: %v", eerr)
 		defer delete(diagnosis)

 		fmt.println(diagnosis)
@@ -127,6 +154,7 @@ Example:

 Output:
 	{
+		"my_raw": 200(h'0000000001000000020000000300000000000000000000000000000000000000'),
 		"my_union": 1010([
 			"int",
 			3
@@ -54,7 +54,7 @@ marshal_into_bytes :: proc(v: any, flags := ENCODE_SMALL, allocator := context.a

 	defer if err != nil { strings.builder_destroy(&b) }

-	if err = marshal_into_builder(&b, v, flags, temp_allocator, loc=loc); err != nil {
+	if err = marshal_into_builder(&b, v, flags, temp_allocator); err != nil {
 		return
 	}

@@ -63,20 +63,20 @@ marshal_into_bytes :: proc(v: any, flags := ENCODE_SMALL, allocator := context.a

 // Marshals the given value into a CBOR byte stream written to the given builder.
 // See docs on the `marshal_into` proc group for more info.
-marshal_into_builder :: proc(b: ^strings.Builder, v: any, flags := ENCODE_SMALL, temp_allocator := context.temp_allocator, loc := #caller_location) -> Marshal_Error {
-	return marshal_into_writer(strings.to_writer(b), v, flags, temp_allocator, loc=loc)
+marshal_into_builder :: proc(b: ^strings.Builder, v: any, flags := ENCODE_SMALL, temp_allocator := context.temp_allocator) -> Marshal_Error {
+	return marshal_into_writer(strings.to_writer(b), v, flags, temp_allocator)
 }

 // Marshals the given value into a CBOR byte stream written to the given writer.
 // See docs on the `marshal_into` proc group for more info.
-marshal_into_writer :: proc(w: io.Writer, v: any, flags := ENCODE_SMALL, temp_allocator := context.temp_allocator, loc := #caller_location) -> Marshal_Error {
+marshal_into_writer :: proc(w: io.Writer, v: any, flags := ENCODE_SMALL, temp_allocator := context.temp_allocator) -> Marshal_Error {
 	encoder := Encoder{flags, w, temp_allocator}
-	return marshal_into_encoder(encoder, v, loc=loc)
+	return marshal_into_encoder(encoder, v)
 }

 // Marshals the given value into a CBOR byte stream written to the given encoder.
 // See docs on the `marshal_into` proc group for more info.
-marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (err: Marshal_Error) {
+marshal_into_encoder :: proc(e: Encoder, v: any) -> (err: Marshal_Error) {
 	e := e

 	if e.temp_allocator.procedure == nil {
@@ -97,11 +97,14 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e
 		return impl->marshal(e, v)
 	}

-	ti := runtime.type_info_base(type_info_of(v.id))
-	a := any{v.data, ti.id}
+	ti := runtime.type_info_core(type_info_of(v.id))
+	return _marshal_into_encoder(e, v, ti)
+}

+_marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (err: Marshal_Error) {
+	a := any{v.data, ti.id}
 	#partial switch info in ti.variant {
-	case runtime.Type_Info_Named:
+	case runtime.Type_Info_Named, runtime.Type_Info_Enum, runtime.Type_Info_Bit_Field:
 		unreachable()

 	case runtime.Type_Info_Pointer:
@@ -223,18 +226,38 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e
 		}

 		err_conv(_encode_u64(e, u64(info.count), .Array)) or_return
+
+		if impl, ok := _tag_implementations_type[info.elem.id]; ok {
+			for i in 0..<info.count {
+				data := uintptr(v.data) + uintptr(i*info.elem_size)
+				impl->marshal(e, any{rawptr(data), info.elem.id}) or_return
+			}
+			return
+		}
+
+		elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
 		for i in 0..<info.count {
 			data := uintptr(v.data) + uintptr(i*info.elem_size)
-			marshal_into(e, any{rawptr(data), info.elem.id}) or_return
+			_marshal_into_encoder(e, any{rawptr(data), info.elem.id}, elem_ti) or_return
 		}
 		return

 	case runtime.Type_Info_Enumerated_Array:
 		// index := runtime.type_info_base(info.index).variant.(runtime.Type_Info_Enum)
 		err_conv(_encode_u64(e, u64(info.count), .Array)) or_return
+
+		if impl, ok := _tag_implementations_type[info.elem.id]; ok {
+			for i in 0..<info.count {
+				data := uintptr(v.data) + uintptr(i*info.elem_size)
+				impl->marshal(e, any{rawptr(data), info.elem.id}) or_return
+			}
+			return
+		}
+
+		elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
 		for i in 0..<info.count {
 			data := uintptr(v.data) + uintptr(i*info.elem_size)
-			marshal_into(e, any{rawptr(data), info.elem.id}) or_return
+			_marshal_into_encoder(e, any{rawptr(data), info.elem.id}, elem_ti) or_return
 		}
 		return
 		
@@ -246,9 +269,19 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e

 		array := (^mem.Raw_Dynamic_Array)(v.data)
 		err_conv(_encode_u64(e, u64(array.len), .Array)) or_return
+
+		if impl, ok := _tag_implementations_type[info.elem.id]; ok {
+			for i in 0..<array.len {
+				data := uintptr(array.data) + uintptr(i*info.elem_size)
+				impl->marshal(e, any{rawptr(data), info.elem.id}) or_return
+			}
+			return
+		}
+
+		elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
 		for i in 0..<array.len {
 			data := uintptr(array.data) + uintptr(i*info.elem_size)
-			marshal_into(e, any{rawptr(data), info.elem.id}) or_return
+			_marshal_into_encoder(e, any{rawptr(data), info.elem.id}, elem_ti) or_return
 		}
 		return

@@ -260,9 +293,19 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e

 		array := (^mem.Raw_Slice)(v.data)
 		err_conv(_encode_u64(e, u64(array.len), .Array)) or_return
+
+		if impl, ok := _tag_implementations_type[info.elem.id]; ok {
+			for i in 0..<array.len {
+				data := uintptr(array.data) + uintptr(i*info.elem_size)
+				impl->marshal(e, any{rawptr(data), info.elem.id}) or_return
+			}
+			return
+		}
+
+		elem_ti := runtime.type_info_core(type_info_of(info.elem.id))
 		for i in 0..<array.len {
 			data := uintptr(array.data) + uintptr(i*info.elem_size)
-			marshal_into(e, any{rawptr(data), info.elem.id}) or_return
+			_marshal_into_encoder(e, any{rawptr(data), info.elem.id}, elem_ti) or_return
 		}
 		return

@@ -308,7 +351,8 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e
 				builder := strings.builder_from_slice(res[:])
 				e.writer = strings.to_stream(&builder)

-				assert(_encode_u64(e, u64(len(str)), .Text) == nil)
+				err := _encode_u64(e, u64(len(str)), .Text)
+				assert(err == nil)
 				res[9] = u8(len(builder.buf))
 				assert(res[9] < 10)
 				return
@@ -437,9 +481,7 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e
 			}
 		}

-		marshal_entry :: #force_inline proc(e: Encoder, info: runtime.Type_Info_Struct, v: any, name: string, i: int) -> Marshal_Error {
-			err_conv(_encode_text(e, name)) or_return
-
+		marshal_entry :: #force_inline proc(e: Encoder, info: runtime.Type_Info_Struct, v: any, i: int) -> Marshal_Error {
 			id := info.types[i].id
 			data := rawptr(uintptr(v.data) + info.offsets[i])
 			field_any := any{data, id}
@@ -463,7 +505,7 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e
 		}
 		
 		n: u64; {
-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				if field_name(info, i) != "-" {
 					n += 1
 				}
@@ -473,37 +515,41 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e

 		if .Deterministic_Map_Sorting in e.flags {
 			Name :: struct {
-				name:  string,
+				name:  []byte,
 				field: int,
 			}
 			entries := make([dynamic]Name, 0, n, e.temp_allocator) or_return
 			defer delete(entries)

-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				fname := field_name(info, i)
 				if fname == "-" {
 					continue
 				}

-				append(&entries, Name{fname, i}) or_return
+				key_builder := strings.builder_make(e.temp_allocator) or_return
+				err_conv(_encode_text(Encoder{e.flags, strings.to_stream(&key_builder), e.temp_allocator}, fname)) or_return
+				append(&entries, Name{key_builder.buf[:], i}) or_return
 			}

 			// Sort lexicographic on the bytes of the key.
 			slice.sort_by_cmp(entries[:], proc(a, b: Name) -> slice.Ordering {
-				return slice.Ordering(bytes.compare(transmute([]byte)a.name, transmute([]byte)b.name))
+				return slice.Ordering(bytes.compare(a.name, b.name))
 			})

 			for entry in entries {
-				marshal_entry(e, info, v, entry.name, entry.field) or_return
+				io.write_full(e.writer, entry.name) or_return
+				marshal_entry(e, info, v, entry.field) or_return
 			}
 		} else {
-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				fname := field_name(info, i)
 				if fname == "-" {
 					continue
 				}

-				marshal_entry(e, info, v, fname, i) or_return
+				err_conv(_encode_text(e, fname)) or_return
+				marshal_entry(e, info, v, i) or_return
 			}
 		}
 		return
@@ -542,9 +588,6 @@ marshal_into_encoder :: proc(e: Encoder, v: any, loc :=  #caller_location) -> (e

 		return marshal_into(e, any{v.data, vti.id})

-	case runtime.Type_Info_Enum:
-		return marshal_into(e, any{v.data, info.base.id})
-
 	case runtime.Type_Info_Bit_Set:
 		// Store bit_set as big endian just like the protocol.
 		do_byte_swap := !reflect.bit_set_is_big_endian(v)
@@ -96,7 +96,8 @@ _unmarshal_value :: proc(d: Decoder, v: any, hdr: Header, allocator := context.a
 			ti = reflect.type_info_base(variant)
 			if !reflect.is_pointer_internally(variant) {
 				tag := any{rawptr(uintptr(v.data) + u.tag_offset), u.tag_type.id}
-				assert(_assign_int(tag, 1))
+				assigned := _assign_int(tag, 1)
+				assert(assigned)
 			}
 		}
 	}
@@ -520,9 +521,7 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		return

 	case reflect.Type_Info_Array:
-		_, scap := err_conv(_decode_len_container(d, add)) or_return
-		length := min(scap, t.count)
-	
+		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		if length > t.count {
 			return _unsupported(v, hdr)
 		}
@@ -534,9 +533,7 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		return

 	case reflect.Type_Info_Enumerated_Array:
-		_, scap := err_conv(_decode_len_container(d, add)) or_return
-		length := min(scap, t.count)
-	
+		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		if length > t.count {
 			return _unsupported(v, hdr)
 		}
@@ -548,9 +545,7 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		return

 	case reflect.Type_Info_Complex:
-		_, scap := err_conv(_decode_len_container(d, add)) or_return
-		length := min(scap, 2)
-	
+		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		if length > 2 {
 			return _unsupported(v, hdr)
 		}
@@ -570,9 +565,7 @@ _unmarshal_array :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header
 		return
 	
 	case reflect.Type_Info_Quaternion:
-		_, scap := err_conv(_decode_len_container(d, add)) or_return
-		length := min(scap, 4)
-	
+		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		if length > 4 {
 			return _unsupported(v, hdr)
 		}
@@ -626,14 +619,14 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,

 	#partial switch t in ti.variant {
 	case reflect.Type_Info_Struct:
-		if t.is_raw_union {
+		if .raw_union in t.flags {
 			return _unsupported(v, hdr)
 		}

 		length, _ := err_conv(_decode_len_container(d, add)) or_return
 		unknown := length == -1
 		fields := reflect.struct_fields_zipped(ti.id)
-	
+
 		for idx := 0; idx < len(fields) && (unknown || idx < length); idx += 1 {
 			// Decode key, keys can only be strings.
 			key: string
@@ -646,7 +639,7 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
 				key = keyv
 			}
 			defer delete(key, context.temp_allocator)
-			
+
 			// Find matching field.
 			use_field_idx := -1
 			{
@@ -13,13 +13,14 @@ iterate_csv_from_string :: proc(filename: string) {
 	r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
 	defer csv.reader_destroy(&r)

-	if csv_data, ok := os.read_entire_file(filename); ok {
+	csv_data, ok := os.read_entire_file(filename)
+	if ok {
 		csv.reader_init_with_string(&r, string(csv_data))
-		defer delete(csv_data)
 	} else {
 		fmt.printfln("Unable to open file: %v", filename)
 		return
 	}
+	defer delete(csv_data)

 	for r, i, err in csv.iterator_next(&r) {
 		if err != nil { /* Do something with error */ }
@@ -38,9 +39,9 @@ iterate_csv_from_stream :: proc(filename: string) {
 	r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
 	defer csv.reader_destroy(&r)

-	handle, errno := os.open(filename)
-	if errno != os.ERROR_NONE {
-		fmt.printfln("Error opening file: %v", filename)
+	handle, err := os.open(filename)
+	if err != nil {
+		fmt.eprintfln("Error opening file: %v", filename)
 		return
 	}
 	defer os.close(handle)
@@ -62,13 +63,14 @@ read_csv_from_string :: proc(filename: string) {
 	r.reuse_record_buffer = true // Without it you have to delete each of the fields within it
 	defer csv.reader_destroy(&r)

-	if csv_data, ok := os.read_entire_file(filename); ok {
+	csv_data, ok := os.read_entire_file(filename)
+	if ok {
 		csv.reader_init_with_string(&r, string(csv_data))
-		defer delete(csv_data)
 	} else {
 		fmt.printfln("Unable to open file: %v", filename)
 		return
 	}
+	defer delete(csv_data)

 	records, err := csv.read_all(&r)
 	if err != nil { /* Do something with CSV parse error */ }
@@ -1,5 +1,6 @@
 package encoding_hex

+import "core:io"
 import "core:strings"

 encode :: proc(src: []byte, allocator := context.allocator, loc := #caller_location) -> []byte #no_bounds_check {
@@ -14,6 +15,12 @@ encode :: proc(src: []byte, allocator := context.allocator, loc := #caller_locat
 	return dst
 }

+// encode_into_writer writes two bytes to `dst` for every byte of `src`:
+// the HEXTABLE entry for the high nibble followed by the entry for the
+// low nibble.  It stops and returns the error of the first failed write,
+// or nil once all of `src` has been written.
+encode_into_writer :: proc(dst: io.Writer, src: []byte) -> io.Error {
+	for b in src {
+		hi := HEXTABLE[b>>4]
+		lo := HEXTABLE[b&0x0f]
+		io.write(dst, {hi, lo}) or_return
+	}
+	return nil
+}

 decode :: proc(src: []byte, allocator := context.allocator, loc := #caller_location) -> (dst: []byte, ok: bool) #no_bounds_check {
 	if len(src) % 2 == 1 {
@@ -82,15 +82,17 @@ Map :: distinct map[string]map[string]string

 load_map_from_string :: proc(src: string, allocator: runtime.Allocator, options := DEFAULT_OPTIONS) -> (m: Map, err: runtime.Allocator_Error) {
 	unquote :: proc(val: string) -> (string, runtime.Allocator_Error) {
-		v, allocated, ok := strconv.unquote_string(val)
-		if !ok {
-			return strings.clone(val)
+		if len(val) > 0 && (val[0] == '"' || val[0] == '\'') {
+			v, allocated, ok := strconv.unquote_string(val)
+			if !ok {
+				return strings.clone(val)
+			}
+			if allocated {
+				return v, nil
+			}
+			return strings.clone(v), nil
 		}
-		if allocated {
-			return v, nil
-		}
-		return strings.clone(v)
-
+		return strings.clone(val)
 	}

 	context.allocator = allocator
@@ -121,7 +123,7 @@ load_map_from_path :: proc(path: string, allocator: runtime.Allocator, options :
 	data := os.read_entire_file(path, allocator) or_return
 	defer delete(data, allocator)
 	m, err = load_map_from_string(string(data), allocator, options)
-	ok = err != nil
+	ok = err == nil
 	defer if !ok {
 		delete_map(m)
 	}
@@ -142,6 +144,7 @@ delete_map :: proc(m: Map) {
 			delete(value, allocator)
 		}
 		delete(section)
+		delete(pairs)
 	}
 	delete(m)
 }
@@ -100,38 +100,7 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:

 	case runtime.Type_Info_Integer:
 		buf: [40]byte
-		u: u128
-		switch i in a {
-		case i8:      u = u128(i)
-		case i16:     u = u128(i)
-		case i32:     u = u128(i)
-		case i64:     u = u128(i)
-		case i128:    u = u128(i)
-		case int:     u = u128(i)
-		case u8:      u = u128(i)
-		case u16:     u = u128(i)
-		case u32:     u = u128(i)
-		case u64:     u = u128(i)
-		case u128:    u = u128(i)
-		case uint:    u = u128(i)
-		case uintptr: u = u128(i)
-
-		case i16le:  u = u128(i)
-		case i32le:  u = u128(i)
-		case i64le:  u = u128(i)
-		case u16le:  u = u128(i)
-		case u32le:  u = u128(i)
-		case u64le:  u = u128(i)
-		case u128le: u = u128(i)
-
-		case i16be:  u = u128(i)
-		case i32be:  u = u128(i)
-		case i64be:  u = u128(i)
-		case u16be:  u = u128(i)
-		case u32be:  u = u128(i)
-		case u64be:  u = u128(i)
-		case u128be: u = u128(i)
-		}
+		u := cast_any_int_to_u128(a)

 		s: string

@@ -310,7 +279,12 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 							case cstring: name = string(s)
 							}
 							opt_write_key(w, opt, name) or_return
-
+						case runtime.Type_Info_Integer:
+							buf: [40]byte
+							u := cast_any_int_to_u128(ka)
+							name = strconv.append_bits_128(buf[:], u, 10, info.signed, 8*kti.size, "0123456789", nil)
+							
+							opt_write_key(w, opt, name) or_return
 						case: return .Unsupported_Type
 						}
 					}
@@ -406,10 +380,15 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 			ti := runtime.type_info_base(type_info_of(v.id))
 			info := ti.variant.(runtime.Type_Info_Struct)
 			first_iteration := true
-			for name, i in info.names {
+			for name, i in info.names[:info.field_count] {
 				omitempty := false

 				json_name, extra := json_name_from_tag_value(reflect.struct_tag_get(reflect.Struct_Tag(info.tags[i]), "json"))
+
+				if json_name == "-" {
+					continue
+				}
+
 				for flag in strings.split_iterator(&extra, ",") {
 					switch flag {
 					case "omitempty":
@@ -657,3 +636,41 @@ opt_write_indentation :: proc(w: io.Writer, opt: ^Marshal_Options) -> (err: io.E

 	return
 }
+
+// Converts the integer held in `any_int_value` to `u128` with a plain
+// `u128(...)` cast, covering the platform integer types and the explicit
+// little-/big-endian variants (including the 128-bit signed ones).
+// A value whose type is not one of the listed integer types yields 0.
+@(private)
+cast_any_int_to_u128 :: proc(any_int_value: any) -> u128 {
+	u: u128 = 0
+	switch i in any_int_value {
+	case i8:      u = u128(i)
+	case i16:     u = u128(i)
+	case i32:     u = u128(i)
+	case i64:     u = u128(i)
+	case i128:    u = u128(i)
+	case int:     u = u128(i)
+	case u8:      u = u128(i)
+	case u16:     u = u128(i)
+	case u32:     u = u128(i)
+	case u64:     u = u128(i)
+	case u128:    u = u128(i)
+	case uint:    u = u128(i)
+	case uintptr: u = u128(i)
+
+	case i16le:  u = u128(i)
+	case i32le:  u = u128(i)
+	case i64le:  u = u128(i)
+	case i128le: u = u128(i)
+	case u16le:  u = u128(i)
+	case u32le:  u = u128(i)
+	case u64le:  u = u128(i)
+	case u128le: u = u128(i)
+
+	case i16be:  u = u128(i)
+	case i32be:  u = u128(i)
+	case i64be:  u = u128(i)
+	case i128be: u = u128(i)
+	case u16be:  u = u128(i)
+	case u32be:  u = u128(i)
+	case u64be:  u = u128(i)
+	case u128be: u = u128(i)
+	}
+
+	return u
+}
@@ -363,12 +363,11 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 	}

 	v := v
-	v = reflect.any_base(v)
-	ti := type_info_of(v.id)
+	ti := reflect.type_info_base(type_info_of(v.id))
 	
 	#partial switch t in ti.variant {
 	case reflect.Type_Info_Struct:
-		if t.is_raw_union {
+		if .raw_union in t.flags {
 			return UNSUPPORTED_TYPE
 		}
 	
@@ -475,7 +474,7 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 		}
 		
 	case reflect.Type_Info_Map:
-		if !reflect.is_string(t.key) {
+		if !reflect.is_string(t.key) && !reflect.is_integer(t.key) {
 			return UNSUPPORTED_TYPE
 		}
 		raw_map := (^mem.Raw_Map)(v.data)
@@ -492,25 +491,39 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 			key, _ := parse_object_key(p, p.allocator)
 			unmarshal_expect_token(p, .Colon)
 			
-			
+
 			mem.zero_slice(elem_backing)
 			if uerr := unmarshal_value(p, map_backing_value); uerr != nil {
 				delete(key, p.allocator)
 				return uerr
 			}

-			key_ptr := rawptr(&key)
+			key_ptr: rawptr

-			key_cstr: cstring
-			if reflect.is_cstring(t.key) {
-				key_cstr = cstring(raw_data(key))
-				key_ptr = &key_cstr
+			// Storage for the converted key lives outside the switch so that
+			// `key_ptr` never refers to a variable whose scope has already
+			// ended when the map insertion below reads through it.
+			key_cstr: cstring
+			key_int:  i128
+
+			#partial switch tk in t.key.variant {
+			case runtime.Type_Info_String:
+				key_ptr = rawptr(&key)
+				if reflect.is_cstring(t.key) {
+					key_cstr = cstring(raw_data(key))
+					key_ptr = &key_cstr
+				}
+			case runtime.Type_Info_Integer:
+				parsed, ok := strconv.parse_i128(key)
+				if !ok {
+					// `key` came from `parse_object_key`; release it on this
+					// error path as the other error paths do.
+					delete(key, p.allocator)
+					return UNSUPPORTED_TYPE
+				}
+				// NOTE(review): the key is handed over as an i128 regardless of
+				// the map's declared key type; presumably only the leading bytes
+				// are read for narrower keys. Confirm this is correct for
+				// big-endian targets and endian-specific integer key types.
+				key_int = parsed
+				key_ptr = rawptr(&key_int)
+			case:
+				delete(key, p.allocator)
+				return UNSUPPORTED_TYPE
 			}
-			
+
 			set_ptr := runtime.__dynamic_map_set_without_hash(raw_map, t.map_info, key_ptr, map_backing_value.data)
 			if set_ptr == nil {
 				delete(key, p.allocator)
 			} 
+
+			// For integer keys the key text is no longer needed once the entry
+			// has been set, so free it here. When `set_ptr` is nil the text was
+			// already freed just above; skip it to avoid a double free.
+			if set_ptr != nil && reflect.is_integer(t.key) {
+				delete(key, p.allocator)
+			}
 			
 			if parse_comma(p) {
 				break map_loop
@@ -11,7 +11,7 @@ Write a UUID in the 8-4-4-4-12 format.
 This procedure performs error checking with every byte written.

 If you can guarantee beforehand that your stream has enough space to hold the
-UUID (32 bytes), then it is better to use `unsafe_write` instead as that will
+UUID (36 bytes), then it is better to use `unsafe_write` instead as that will
 be faster.

 Inputs:
@@ -22,7 +22,7 @@ Returns:
 - error: An `io` error, if one occurred, otherwise `nil`.
 */
 write :: proc(w: io.Writer, id: Identifier) -> (error: io.Error) #no_bounds_check {
-	write_octet :: proc (w: io.Writer, octet: u8) -> io.Error #no_bounds_check {
+	write_octet :: proc(w: io.Writer, octet: u8) -> io.Error #no_bounds_check {
 		high_nibble := octet >> 4
 		low_nibble := octet & 0xF

@@ -31,15 +31,15 @@ write :: proc(w: io.Writer, id: Identifier) -> (error: io.Error) #no_bounds_chec
 		return nil
 	}

-	for index in  0 ..<  4 { write_octet(w, id[index]) or_return }
+	for index in 0 ..< 4 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  4 ..<  6 { write_octet(w, id[index]) or_return }
+	for index in 4 ..< 6 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  6 ..<  8 { write_octet(w, id[index]) or_return }
+	for index in 6 ..< 8 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in  8 ..< 10 { write_octet(w, id[index]) or_return }
+	for index in 8 ..< 10 {write_octet(w, id[index]) or_return}
 	io.write_byte(w, '-') or_return
-	for index in 10 ..< 16 { write_octet(w, id[index]) or_return }
+	for index in 10 ..< 16 {write_octet(w, id[index]) or_return}

 	return nil
 }
@@ -54,7 +54,7 @@ Inputs:
 - id: The identifier to convert.
 */
 unsafe_write :: proc(w: io.Writer, id: Identifier) #no_bounds_check {
-	write_octet :: proc (w: io.Writer, octet: u8) #no_bounds_check {
+	write_octet :: proc(w: io.Writer, octet: u8) #no_bounds_check {
 		high_nibble := octet >> 4
 		low_nibble := octet & 0xF

@@ -62,15 +62,15 @@ unsafe_write :: proc(w: io.Writer, id: Identifier) #no_bounds_check {
 		io.write_byte(w, strconv.digits[low_nibble])
 	}

-	for index in  0 ..<  4 { write_octet(w, id[index]) }
+	for index in 0 ..< 4 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  4 ..<  6 { write_octet(w, id[index]) }
+	for index in 4 ..< 6 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  6 ..<  8 { write_octet(w, id[index]) }
+	for index in 6 ..< 8 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in  8 ..< 10 { write_octet(w, id[index]) }
+	for index in 8 ..< 10 {write_octet(w, id[index])}
 	io.write_byte(w, '-')
-	for index in 10 ..< 16 { write_octet(w, id[index]) }
+	for index in 10 ..< 16 {write_octet(w, id[index])}
 }

 /*
@@ -106,7 +106,7 @@ Convert a UUID to a string in the 8-4-4-4-12 format.

 Inputs:
 - id: The identifier to convert.
- buffer: A byte buffer to store the result. Must be at least 32 bytes large.
+- buffer: A byte buffer to store the result. Must be at least 36 bytes large.
 - loc: The caller location for debugging purposes (default: #caller_location)

 Returns:
@@ -119,7 +119,11 @@ to_string_buffer :: proc(
 ) -> (
 	str: string,
 ) {
-	assert(len(buffer) >= EXPECTED_LENGTH, "The buffer provided is not at least 32 bytes large.", loc)
+	assert(
+		len(buffer) >= EXPECTED_LENGTH,
+		"The buffer provided is not at least 36 bytes large.",
+		loc,
+	)
 	builder := strings.builder_from_bytes(buffer)
 	unsafe_write(strings.to_writer(&builder), id)
 	return strings.to_string(builder)
@@ -129,3 +133,4 @@ to_string :: proc {
 	to_string_allocated,
 	to_string_buffer,
 }
+
@@ -126,7 +126,7 @@ error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
 	t.error_count += 1
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 advance_rune :: proc(t: ^Tokenizer) {
 	#no_bounds_check {
 		/*
@@ -170,7 +170,7 @@ peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
 	return 0
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 skip_whitespace :: proc(t: ^Tokenizer) {
 	for {
 		switch t.ch {
@@ -182,7 +182,7 @@ skip_whitespace :: proc(t: ^Tokenizer) {
 	}
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 is_letter :: proc(r: rune) -> bool {
 	if r < utf8.RUNE_SELF {
 		switch r {
@@ -296,7 +296,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 	return
 }

-@(optimization_mode="speed")
+@(optimization_mode="favor_size")
 scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
 	err = .None

@@ -414,4 +414,4 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 		lit = string(t.src[offset : t.offset])
 	}
 	return Token{kind, lit, pos}
-}
+}
@@ -12,7 +12,7 @@ IMPORTING_TIME      :: #config(ODIN_CORE_FLAGS_USE_TIME, time.IS_SUPPORTED)

 // Override support for parsing `net` types.
 // TODO: Update this when the BSDs are supported.
-IMPORTING_NET       :: #config(ODIN_CORE_FLAGS_USE_NET, ODIN_OS == .Windows || ODIN_OS == .Linux || ODIN_OS == .Darwin)
+IMPORTING_NET       :: #config(ODIN_CORE_FLAGS_USE_NET, ODIN_OS == .Windows || ODIN_OS == .Linux || ODIN_OS == .Darwin || ODIN_OS == .FreeBSD)

 TAG_ARGS          :: "args"
 SUBTAG_NAME       :: "name"
@@ -28,7 +28,7 @@ Parse_Error :: struct {
 // Provides more granular information than what just a string could hold.
 Open_File_Error :: struct {
 	filename: string,
-	errno: os.Errno,
+	errno: os.Error,
 	mode: int,
 	perms: int,
 }
@@ -1,4 +1,4 @@
-//+build freebsd, netbsd, openbsd
+//+build netbsd, openbsd
 package flags

 import "base:runtime"
@@ -1,4 +1,4 @@
-//+build !freebsd !netbsd !openbsd
+//+build !netbsd !openbsd
 package flags

 import "base:runtime"
@@ -12,7 +12,7 @@ import "core:reflect"

 // Push a positional argument onto a data struct, checking for specified
 // positionals first before adding it to a fallback field.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 push_positional :: #force_no_inline proc (model: ^$T, parser: ^Parser, arg: string) -> (error: Error) {
 	if bit_array.get(&parser.filled_pos, parser.filled_pos.max_index) {
 		// The max index is set, which means we're out of space.
@@ -74,7 +74,7 @@ register_field :: proc(parser: ^Parser, field: reflect.Struct_Field, index: int)
 }

 // Set a `-flag` argument, Odin-style.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 set_odin_flag :: proc(model: ^$T, parser: ^Parser, name: string) -> (error: Error) {
 	// We make a special case for help requests.
 	switch name {
@@ -100,7 +100,7 @@ set_odin_flag :: proc(model: ^$T, parser: ^Parser, name: string) -> (error: Erro
 }

 // Set a `-flag` argument, UNIX-style.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 set_unix_flag :: proc(model: ^$T, parser: ^Parser, name: string) -> (future_args: int, error: Error) {
 	// We make a special case for help requests.
 	switch name {
@@ -137,7 +137,7 @@ set_unix_flag :: proc(model: ^$T, parser: ^Parser, name: string) -> (future_args
 }

 // Set a `-flag:option` argument.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 set_option :: proc(model: ^$T, parser: ^Parser, name, option: string) -> (error: Error) {
 	field, index := get_field_by_name(model, name) or_return

@@ -176,7 +176,7 @@ set_option :: proc(model: ^$T, parser: ^Parser, name, option: string) -> (error:
 }

 // Set a `-map:key=value` argument.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 set_key_value :: proc(model: ^$T, parser: ^Parser, name, key, value: string) -> (error: Error) {
 	field, index := get_field_by_name(model, name) or_return

@@ -13,7 +13,7 @@ import "core:strings"
@require import "core:time/datetime"
 import "core:unicode/utf8"

-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 parse_and_set_pointer_by_base_type :: proc(ptr: rawptr, str: string, type_info: ^runtime.Type_Info) -> bool {
 	bounded_int :: proc(value, min, max: i128) -> (result: i128, ok: bool) {
 		return value, min <= value && value <= max
@@ -202,7 +202,7 @@ parse_and_set_pointer_by_base_type :: proc(ptr: rawptr, str: string, type_info:
 // especially with files.
 //
 // We want to provide as informative as an error as we can.
-@(optimization_mode="size", disabled=NO_CORE_NAMED_TYPES)
+@(optimization_mode="favor_size", disabled=NO_CORE_NAMED_TYPES)
 parse_and_set_pointer_by_named_type :: proc(ptr: rawptr, str: string, data_type: typeid, arg_tag: string, out_error: ^Error) {
 	// Core types currently supported:
 	//
@@ -254,8 +254,8 @@ parse_and_set_pointer_by_named_type :: proc(ptr: rawptr, str: string, data_type:
 		}

 		handle, errno := os.open(str, mode, perms)
-		if errno != 0 {
-			// NOTE(Feoramund): os.Errno is system-dependent, and there's
+		if errno != nil {
+			// NOTE(Feoramund): os.Error is system-dependent, and there's
 			// currently no good way to translate them all into strings.
 			//
 			// The upcoming `os2` package will hopefully solve this.
@@ -320,7 +320,7 @@ parse_and_set_pointer_by_named_type :: proc(ptr: rawptr, str: string, data_type:
 	}
 }

-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 set_unbounded_integer_by_type :: proc(ptr: rawptr, value: $T, data_type: typeid) where intrinsics.type_is_integer(T) {
 	switch data_type {
 	case i8:      (^i8)     (ptr)^ = cast(i8)      value
@@ -367,7 +367,7 @@ set_unbounded_integer_by_type :: proc(ptr: rawptr, value: $T, data_type: typeid)
 	}
 }

-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 parse_and_set_pointer_by_type :: proc(ptr: rawptr, str: string, type_info: ^runtime.Type_Info, arg_tag: string) -> (error: Error) {
 	#partial switch specific_type_info in type_info.variant {
 	case runtime.Type_Info_Named:
@@ -1,5 +1,5 @@
 //+private
-//+build !freebsd !netbsd !openbsd
+//+build !netbsd !openbsd
 package flags

 import "core:net"
@@ -11,7 +11,7 @@ package flags
@require import "core:strings"

 // This proc is used to assert that `T` meets the expectations of the library.
-@(optimization_mode="size", disabled=ODIN_DISABLE_ASSERT)
+@(optimization_mode="favor_size", disabled=ODIN_DISABLE_ASSERT)
 validate_structure :: proc(model_type: $T, style: Parsing_Style, loc := #caller_location) {
 	positionals_assigned_so_far: bit_array.Bit_Array
 	defer bit_array.destroy(&positionals_assigned_so_far)
@@ -162,7 +162,7 @@ validate_structure :: proc(model_type: $T, style: Parsing_Style, loc := #caller_

 // Validate that all the required arguments are set and that the set arguments
 // are up to the program's expectations.
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 validate_arguments :: proc(model: ^$T, parser: ^Parser) -> Error {
 	check_fields: for field, index in reflect.struct_fields_zipped(T) {
 		was_set := bit_array.get(&parser.fields_set, index)
@@ -32,7 +32,7 @@ Inputs:
 Returns:
 - error: A union of errors; parsing, file open, a help request, or validation.
 */
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 parse :: proc(
 	model: ^$T,
 	args: []string,
@@ -17,7 +17,7 @@ Inputs:
 - program: The name of the program, usually the first argument to `os.args`.
 - style: The argument parsing style, required to show flags in the proper style.
 */
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 write_usage :: proc(out: io.Writer, data_type: typeid, program: string = "", style: Parsing_Style = .Odin) {
 	// All flags get their tags parsed so they can be reasoned about later.
 	Flag :: struct {
@@ -19,7 +19,7 @@ Inputs:
 - allocator: (default: context.allocator)
 - loc: The caller location for debugging purposes (default: #caller_location)
 */
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 parse_or_exit :: proc(
 	model: ^$T,
 	program_args: []string,
@@ -63,7 +63,7 @@ Inputs:
 - error: The error returned from `parse`.
 - style: The argument parsing style, required to show flags in the proper style, when usage is shown.
 */
-@(optimization_mode="size")
+@(optimization_mode="favor_size")
 print_errors :: proc(data_type: typeid, error: Error, program: string, style: Parsing_Style = .Odin) {
 	stderr := os.stream_from_handle(os.stderr)
 	stdout := os.stream_from_handle(os.stdout)
@@ -334,6 +334,27 @@ panicf :: proc(fmt: string, args: ..any, loc := #caller_location) -> ! {
 	message := tprintf(fmt, ..args)
 	p("Panic", message, loc)
 }
+
+// 	Creates a formatted C string
+//
+// 	*Allocates Using Provided Allocator*
+//
+// 	Inputs:
+// 	- args: A variadic list of arguments to be formatted.
+// 	- sep: An optional separator string (default is a single space).
+// 	- allocator: The allocator used for the result (default is context.allocator).
+//
+// 	Returns: A formatted C string, terminated by a 0 byte.
+//
+@(require_results)
+caprint :: proc(args: ..any, sep := " ", allocator := context.allocator) -> cstring {
+	builder: strings.Builder
+	strings.builder_init(&builder, allocator)
+	sbprint(&builder, ..args, sep=sep)
+	// Append the terminating 0 byte before viewing the buffer as a cstring.
+	strings.write_byte(&builder, 0)
+	text := strings.to_string(builder)
+	return cstring(raw_data(text))
+}
+
 // Creates a formatted C string
 //
 // *Allocates Using Context's Allocator*
@@ -346,9 +367,9 @@ panicf :: proc(fmt: string, args: ..any, loc := #caller_location) -> ! {
 // Returns: A formatted C string
 //
@(require_results)
-caprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
+caprintf :: proc(format: string, args: ..any, allocator := context.allocator, newline := false) -> cstring {
 	str: strings.Builder
-	strings.builder_init(&str)
+	strings.builder_init(&str, allocator)
 	sbprintf(&str, format, ..args, newline=newline)
 	strings.write_byte(&str, 0)
 	s := strings.to_string(str)
@@ -365,8 +386,8 @@ caprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
 // Returns: A formatted C string
 //
@(require_results)
-caprintfln :: proc(format: string, args: ..any) -> cstring {
-	return caprintf(format, ..args, newline=true)
+caprintfln :: proc(format: string, args: ..any, allocator := context.allocator) -> cstring {
+	return caprintf(format, ..args, allocator=allocator, newline=true)
 }
 // 	Creates a formatted C string
 //
@@ -380,12 +401,7 @@ caprintfln :: proc(format: string, args: ..any) -> cstring {
 //
@(require_results)
 ctprint :: proc(args: ..any, sep := " ") -> cstring {
-	str: strings.Builder
-	strings.builder_init(&str, context.temp_allocator)
-	sbprint(&str, ..args, sep=sep)
-	strings.write_byte(&str, 0)
-	s := strings.to_string(str)
-	return cstring(raw_data(s))
+	return caprint(args=args, sep=sep, allocator=context.temp_allocator)
 }
 // Creates a formatted C string
 //
@@ -400,12 +416,7 @@ ctprint :: proc(args: ..any, sep := " ") -> cstring {
 //
@(require_results)
 ctprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
-	str: strings.Builder
-	strings.builder_init(&str, context.temp_allocator)
-	sbprintf(&str, format, ..args, newline=newline)
-	strings.write_byte(&str, 0)
-	s := strings.to_string(str)
-	return cstring(raw_data(s))
+	return caprintf(format=format, args=args, allocator=context.temp_allocator, newline=newline)
 }
 // Creates a formatted C string, followed by a newline.
 //
@@ -419,7 +430,7 @@ ctprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
 //
@(require_results)
 ctprintfln :: proc(format: string, args: ..any) -> cstring {
-	return ctprintf(format, ..args, newline=true)
+	return caprintf(format=format, args=args, allocator=context.temp_allocator, newline=true)
 }
 // Formats using the default print settings and writes to the given strings.Builder
 //
@@ -951,10 +962,10 @@ fmt_bad_verb :: proc(fi: ^Info, verb: rune) {
 	io.write_string(fi.writer, "%!", &fi.n)
 	io.write_rune(fi.writer, verb, &fi.n)
 	io.write_byte(fi.writer, '(', &fi.n)
-	if fi.arg.id != nil {
-		reflect.write_typeid(fi.writer, fi.arg.id, &fi.n)
+	if arg := fi.arg; arg != nil {
+		reflect.write_typeid(fi.writer, arg.id, &fi.n)
 		io.write_byte(fi.writer, '=', &fi.n)
-		fmt_value(fi, fi.arg, 'v')
+		fmt_value(fi, arg, 'v')
 	} else {
 		io.write_string(fi.writer, "<nil>", &fi.n)
 	}
@@ -1861,7 +1872,7 @@ handle_tag :: proc(state: ^Info_State, data: rawptr, info: reflect.Type_Info_Str
 		if optional_len == nil {
 			return
 		}
-		for f, i in info.names {
+		for f, i in info.names[:info.field_count] {
 			if f != field_name {
 				continue
 			}
@@ -1965,7 +1976,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 		fmt_bad_verb(fi, the_verb)
 		return
 	}
-	if info.is_raw_union {
+	if .raw_union in info.flags {
 		if type_name == "" {
 			io.write_string(fi.writer, "(raw union)", &fi.n)
 		} else {
@@ -1989,7 +2000,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 	// fi.hash = false;
 	fi.indent += 1

-	is_empty := len(info.names) == 0
+	is_empty := info.field_count == 0

 	if !is_soa && hash && !is_empty {
 		io.write_byte(fi.writer, '\n', &fi.n)
@@ -2010,17 +2021,17 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 			base_type_name = v.name
 		}

-		actual_field_count := len(info.names)
+		actual_field_count := info.field_count

 		n := uintptr(info.soa_len)

 		if info.soa_kind == .Slice {
-			actual_field_count = len(info.names)-1 // len
+			actual_field_count = info.field_count-1 // len

 			n = uintptr((^int)(uintptr(v.data) + info.offsets[actual_field_count])^)

 		} else if info.soa_kind == .Dynamic {
-			actual_field_count = len(info.names)-3 // len, cap, allocator
+			actual_field_count = info.field_count-3 // len, cap, allocator

 			n = uintptr((^int)(uintptr(v.data) + info.offsets[actual_field_count])^)
 		}
@@ -2099,7 +2110,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 		}
 	} else {
 		field_count := -1
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			optional_len: int = -1
 			use_nul_termination: bool = false
 			verb := the_verb if the_verb == 'w' else 'v'
@@ -2605,7 +2616,7 @@ fmt_bit_field :: proc(fi: ^Info, v: any, verb: rune, info: runtime.Type_Info_Bit


 	field_count := -1
-	for name, i in info.names {
+	for name, i in info.names[:info.field_count] {
 		field_verb := verb
 		if handle_bit_field_tag(v.data, info, i, &field_verb) {
 			continue
@@ -2717,7 +2728,8 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 						}

 					case runtime.Type_Info_Struct,
-					     runtime.Type_Info_Union:
+					     runtime.Type_Info_Union,
+					     runtime.Type_Info_Bit_Field:
 						if ptr == nil {
 							io.write_string(fi.writer, "<nil>", &fi.n)
 							return
@@ -2751,9 +2763,11 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 			elem := runtime.type_info_base(info.elem)
 			if elem != nil {
 				if n, ok := fi.optional_len.?; ok {
+					fi.optional_len = nil
 					fmt_array(fi, ptr, n, elem.size, elem, verb)
 					return
 				} else if fi.use_nul_termination {
+					fi.use_nul_termination = false
 					fmt_array_nul_terminated(fi, ptr, -1, elem.size, elem, verb)
 					return
 				}
@@ -2855,8 +2869,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := info.count
 		ptr := v.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
@@ -2867,8 +2883,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := slice.len
 		ptr := slice.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
@@ -2879,8 +2897,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := array.len
 		ptr := array.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
--- a/Show More
+++ b/Show More