diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ca3d87b12..73cd3493b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,7 +6,7 @@ jobs:
     name: NetBSD Build, Check, and Test
     runs-on: ubuntu-latest
     env:
-      PKGSRC_BRANCH: 2024Q1
+      PKGSRC_BRANCH: 2024Q2
     steps:
     - uses: actions/checkout@v4
     - name: Build, Check, and Test
@@ -19,10 +19,7 @@ jobs:
         copyback: false
         prepare: |
           PKG_PATH="https://cdn.NetBSD.org/pub/pkgsrc/packages/NetBSD/$(uname -p)/$(uname -r | cut -d_ -f1)_${PKGSRC_BRANCH}/All" /usr/sbin/pkg_add pkgin
-          pkgin -y in gmake git bash python311
-          pkgin -y in libxml2 perl zstd
-          /usr/sbin/pkg_add https://github.com/andreas-jonsson/llvm17-netbsd-bin/releases/download/pkgsrc-current/llvm-17.0.6.tgz
-          /usr/sbin/pkg_add https://github.com/andreas-jonsson/llvm17-netbsd-bin/releases/download/pkgsrc-current/clang-17.0.6.tgz
+          pkgin -y in gmake git bash python311 llvm clang
           ln -s /usr/pkg/bin/python3.11 /usr/bin/python3
         run: |
           git config --global --add safe.directory $(pwd)
@@ -91,13 +88,13 @@ jobs:
       - name: Download LLVM (MacOS Intel)
         if: matrix.os == 'macos-13'
         run: |
-          brew install llvm@17
+          brew install llvm@17 lua@5.4
           echo "/usr/local/opt/llvm@17/bin" >> $GITHUB_PATH
 
       - name: Download LLVM (MacOS ARM)
         if: matrix.os == 'macos-14'
         run: |
-          brew install llvm@17 wasmtime
+          brew install llvm@17 wasmtime lua@5.4
           echo "/opt/homebrew/opt/llvm@17/bin" >> $GITHUB_PATH
 
       - name: Build Odin
@@ -207,6 +204,7 @@ jobs:
         shell: cmd
         run: |
           call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat
+          copy vendor\lua\5.4\windows\*.dll .
           odin test tests/vendor -all-packages -define:ODIN_TEST_FANCY=false
       - name: Odin internals tests
         shell: cmd
diff --git a/.gitignore b/.gitignore
index c8a66d288..4c3c98b72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,38 +24,6 @@ bld/
 ![Cc]ore/[Ll]og/
 tests/documentation/verify/
 tests/documentation/all.odin-doc
-tests/internal/test_map
-tests/internal/test_pow
-tests/internal/test_rtti
-tests/core/test_base64
-tests/core/test_cbor
-tests/core/test_core_compress
-tests/core/test_core_container
-tests/core/test_core_filepath
-tests/core/test_core_fmt
-tests/core/test_core_i18n
-tests/core/test_core_image
-tests/core/test_core_libc
-tests/core/test_core_match
-tests/core/test_core_math
-tests/core/test_core_net
-tests/core/test_core_os_exit
-tests/core/test_core_reflect
-tests/core/test_core_strings
-tests/core/test_core_time
-tests/core/test_crypto
-tests/core/test_hash
-tests/core/test_hex
-tests/core/test_hxa
-tests/core/test_json
-tests/core/test_linalg_glsl_math
-tests/core/test_noise
-tests/core/test_varint
-tests/core/test_xml
-tests/core/test_core_slice
-tests/core/test_core_thread
-tests/core/test_core_runtime
-tests/vendor/vendor_botan
 # Visual Studio 2015 cache/options directory
 .vs/
 # Visual Studio Code options directory
@@ -63,6 +31,7 @@ tests/vendor/vendor_botan
 # Uncomment if you have tasks that create the project's static files in wwwroot
 #wwwroot/
 demo
+benchmark
 
 # MSTest test Results
 [Tt]est[Rr]esult*/
diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin
index 8a16ca40e..37a42b904 100644
--- a/base/intrinsics/intrinsics.odin
+++ b/base/intrinsics/intrinsics.odin
@@ -38,9 +38,12 @@ count_leading_zeros  :: proc(x: $T) -> T where type_is_integer(T) || type_is_sim
 reverse_bits         :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
 byte_swap            :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) ---
 
-overflow_add :: proc(lhs, rhs: $T) -> (T, bool) ---
-overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) ---
-overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) ---
+overflow_add :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) where type_is_integer(T) #optional_ok ---
+
+add_sat :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
+sub_sat :: proc(lhs, rhs: $T) -> T where type_is_integer(T) ---
 
 sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---
 
diff --git a/base/runtime/core.odin b/base/runtime/core.odin
index a758a2fdd..56aaefaa9 100644
--- a/base/runtime/core.odin
+++ b/base/runtime/core.odin
@@ -66,7 +66,7 @@ Type_Info_Named :: struct {
 	name: string,
 	base: ^Type_Info,
 	pkg:  string,
-	loc:  Source_Code_Location,
+	loc:  ^Source_Code_Location,
 }
 Type_Info_Integer    :: struct {signed: bool, endianness: Platform_Endianness}
 Type_Info_Rune       :: struct {}
@@ -112,23 +112,32 @@ Type_Info_Parameters :: struct { // Only used for procedures parameters and resu
 }
 Type_Info_Tuple :: Type_Info_Parameters // Will be removed eventually
 
-Type_Info_Struct :: struct {
-	types:        []^Type_Info,
-	names:        []string,
-	offsets:      []uintptr,
-	usings:       []bool,
-	tags:         []string,
-	is_packed:    bool,
-	is_raw_union: bool,
-	is_no_copy:   bool,
-	custom_align: bool,
+Type_Info_Struct_Flags :: distinct bit_set[Type_Info_Struct_Flag; u8]
+Type_Info_Struct_Flag :: enum u8 {
+	packed    = 0,
+	raw_union = 1,
+	no_copy   = 2,
+	align     = 3,
+}
 
-	equal: Equal_Proc, // set only when the struct has .Comparable set but does not have .Simple_Compare set
+Type_Info_Struct :: struct {
+	// Slice these with `field_count`
+	types:   [^]^Type_Info `fmt:"v,field_count"`,
+	names:   [^]string     `fmt:"v,field_count"`,
+	offsets: [^]uintptr    `fmt:"v,field_count"`,
+	usings:  [^]bool       `fmt:"v,field_count"`,
+	tags:    [^]string     `fmt:"v,field_count"`,
+
+	field_count: i32,
+
+	flags: Type_Info_Struct_Flags,
 
 	// These are only set iff this structure is an SOA structure
 	soa_kind:      Type_Info_Struct_Soa_Kind,
+	soa_len:       i32,
 	soa_base_type: ^Type_Info,
-	soa_len:       int,
+
+	equal: Equal_Proc, // set only when the struct has .Comparable set but does not have .Simple_Compare set
 }
 Type_Info_Union :: struct {
 	variants:     []^Type_Info,
@@ -142,9 +151,9 @@ Type_Info_Union :: struct {
 	shared_nil:   bool,
 }
 Type_Info_Enum :: struct {
-	base:      ^Type_Info,
-	names:     []string,
-	values:    []Type_Info_Enum_Value,
+	base:   ^Type_Info,
+	names:  []string,
+	values: []Type_Info_Enum_Value,
 }
 Type_Info_Map :: struct {
 	key:      ^Type_Info,
@@ -187,11 +196,12 @@ Type_Info_Soa_Pointer :: struct {
 }
 Type_Info_Bit_Field :: struct {
 	backing_type: ^Type_Info,
-	names:        []string,
-	types:        []^Type_Info,
-	bit_sizes:    []uintptr,
-	bit_offsets:  []uintptr,
-	tags:         []string,
+	names:        [^]string     `fmt:"v,field_count"`,
+	types:        [^]^Type_Info `fmt:"v,field_count"`,
+	bit_sizes:    [^]uintptr    `fmt:"v,field_count"`,
+	bit_offsets:  [^]uintptr    `fmt:"v,field_count"`,
+	tags:         [^]string     `fmt:"v,field_count"`,
+	field_count:  int,
 }
 
 Type_Info_Flag :: enum u8 {
diff --git a/base/runtime/core_builtin.odin b/base/runtime/core_builtin.odin
index ff87316f2..38ad95be8 100644
--- a/base/runtime/core_builtin.odin
+++ b/base/runtime/core_builtin.odin
@@ -333,16 +333,23 @@ make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, alloca
 // Note: Prefer using the procedure group `make`.
 @(builtin, require_results)
 make_dynamic_array_len_cap :: proc($T: typeid/[dynamic]$E, #any_int len: int, #any_int cap: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
-	make_dynamic_array_error_loc(loc, len, cap)
-	array.allocator = allocator // initialize allocator before just in case it fails to allocate any memory
-	data := mem_alloc_bytes(size_of(E)*cap, align_of(E), allocator, loc) or_return
-	s := Raw_Dynamic_Array{raw_data(data), len, cap, allocator}
-	if data == nil && size_of(E) != 0 {
-		s.len, s.cap = 0, 0
-	}
-	array = transmute(T)s
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), len, cap, allocator, loc)
 	return
 }
+
+@(require_results)
+_make_dynamic_array_len_cap :: proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, #any_int len: int, #any_int cap: int, allocator := context.allocator, loc := #caller_location) -> (err: Allocator_Error) {
+	make_dynamic_array_error_loc(loc, len, cap)
+	array.allocator = allocator // initialize allocator before just in case it fails to allocate any memory
+	data := mem_alloc_bytes(size_of_elem*cap, align_of_elem, allocator, loc) or_return
+	use_zero := data == nil && size_of_elem != 0
+	array.data = raw_data(data)
+	array.len = 0 if use_zero else len
+	array.cap = 0 if use_zero else cap
+	array.allocator = allocator
+	return
+}
+
 // `make_map` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
@@ -440,107 +447,103 @@ delete_key :: proc(m: ^$T/map[$K]$V, key: K) -> (deleted_key: K, deleted_value:
 	return
 }
 
-_append_elem :: #force_inline proc(array: ^$T/[dynamic]$E, arg: E, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elem :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, arg_ptr: rawptr, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
-		return 0, nil
+		return
 	}
-	when size_of(E) == 0 {
-		array := (^Raw_Dynamic_Array)(array)
-		array.len += 1
-		return 1, nil
-	} else {
-		if cap(array) < len(array)+1 {
-			// Same behavior as _append_elems but there's only one arg, so we always just add DEFAULT_DYNAMIC_ARRAY_CAPACITY.
-			cap := 2 * cap(array) + DEFAULT_DYNAMIC_ARRAY_CAPACITY
 
-			// do not 'or_return' here as it could be a partial success
-			if should_zero {
-				err = reserve(array, cap, loc)
-			} else {
-				err = non_zero_reserve(array, cap, loc) 
-			}
-		}
-		if cap(array)-len(array) > 0 {
-			a := (^Raw_Dynamic_Array)(array)
-			when size_of(E) != 0 {
-				data := ([^]E)(a.data)
-				assert(data != nil, loc=loc)
-				data[a.len] = arg
-			}
-			a.len += 1
-			return 1, err
-		}
-		return 0, err
+	if array.cap < array.len+1 {
+		// Same behavior as _append_elems but there's only one arg, so we always just add DEFAULT_DYNAMIC_ARRAY_CAPACITY.
+		cap := 2 * array.cap + DEFAULT_DYNAMIC_ARRAY_CAPACITY
+
+		// do not 'or_return' here as it could be a partial success
+		err = _reserve_dynamic_array(array, size_of_elem, align_of_elem, cap, should_zero, loc)
 	}
+	if array.cap-array.len > 0 {
+		data := ([^]byte)(array.data)
+		assert(data != nil, loc=loc)
+		data = data[array.len*size_of_elem:]
+		intrinsics.mem_copy_non_overlapping(data, arg_ptr, size_of_elem)
+		array.len += 1
+		n = 1
+	}
+	return
 }
 
 @builtin
 append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elem(array, arg, true, loc=loc)
+	when size_of(E) == 0 {
+		(^Raw_Dynamic_Array)(array).len += 1
+		return 1, nil
+	} else {
+		arg := arg
+		return _append_elem((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), &arg, true, loc=loc)
+	}
 }
 
 @builtin
 non_zero_append_elem :: proc(array: ^$T/[dynamic]$E, #no_broadcast arg: E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elem(array, arg, false, loc=loc)
+	when size_of(E) == 0 {
+		(^Raw_Dynamic_Array)(array).len += 1
+		return 1, nil
+	} else {
+		arg := arg
+		return _append_elem((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), &arg, false, loc=loc)
+	}
 }
 
-_append_elems :: #force_inline proc(array: ^$T/[dynamic]$E, should_zero: bool, loc := #caller_location, args: ..E) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_elems :: #force_inline proc(array: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, should_zero: bool, loc := #caller_location, args: rawptr, arg_len: int) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return 0, nil
 	}
 
-	arg_len := len(args)
 	if arg_len <= 0 {
 		return 0, nil
 	}
 
-	when size_of(E) == 0 {
-		array := (^Raw_Dynamic_Array)(array)
-		array.len += arg_len
-		return arg_len, nil
-	} else {
-		if cap(array) < len(array)+arg_len {
-			cap := 2 * cap(array) + max(DEFAULT_DYNAMIC_ARRAY_CAPACITY, arg_len)
+	if array.cap < array.len+arg_len {
+		cap := 2 * array.cap + max(DEFAULT_DYNAMIC_ARRAY_CAPACITY, arg_len)
 
-			// do not 'or_return' here as it could be a partial success
-			if should_zero {
-				err = reserve(array, cap, loc)
-			} else {
-				err = non_zero_reserve(array, cap, loc)
-			}
-		}
-		arg_len = min(cap(array)-len(array), arg_len)
-		if arg_len > 0 {
-			a := (^Raw_Dynamic_Array)(array)
-			when size_of(E) != 0 {
-				data := ([^]E)(a.data)
-				assert(data != nil, loc=loc)
-				intrinsics.mem_copy(&data[a.len], raw_data(args), size_of(E) * arg_len)
-			}
-			a.len += arg_len
-		}
-		return arg_len, err
+		// do not 'or_return' here as it could be a partial success
+		err = _reserve_dynamic_array(array, size_of_elem, align_of_elem, cap, should_zero, loc)
 	}
+	arg_len := arg_len
+	arg_len = min(array.cap-array.len, arg_len)
+	if arg_len > 0 {
+		data := ([^]byte)(array.data)
+		assert(data != nil, loc=loc)
+		data = data[array.len*size_of_elem:]
+		intrinsics.mem_copy(data, args, size_of_elem * arg_len) // must be mem_copy (overlapping)
+		array.len += arg_len
+	}
+	return arg_len, err
 }
 
 @builtin
 append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elems(array, true, loc, ..args)
+	when size_of(E) == 0 {
+		a := (^Raw_Dynamic_Array)(array)
+		a.len += len(args)
+		return len(args), nil
+	} else {
+		return _append_elems((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), true, loc, raw_data(args), len(args))
+	}
 }
 
 @builtin
 non_zero_append_elems :: proc(array: ^$T/[dynamic]$E, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	return _append_elems(array, false, loc, ..args)
+	when size_of(E) == 0 {
+		a := (^Raw_Dynamic_Array)(array)
+		a.len += len(args)
+		return len(args), nil
+	} else {
+		return _append_elems((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), false, loc, raw_data(args), len(args))
+	}
 }
 
 // The append_string built-in procedure appends a string to the end of a [dynamic]u8 like type
 _append_elem_string :: proc(array: ^$T/[dynamic]$E/u8, arg: $A/string, should_zero: bool, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
-	args := transmute([]E)arg
-	if should_zero { 
-		return append_elems(array, ..args, loc=loc)
-	} else {
-		return non_zero_append_elems(array, ..args, loc=loc)
-	}
+	return _append_elems((^Raw_Dynamic_Array)(array), 1, 1, should_zero, loc, raw_data(arg), len(arg))
 }
 
 @builtin
@@ -679,7 +682,7 @@ assign_at_elem :: proc(array: ^$T/[dynamic]$E, index: int, arg: E, loc := #calle
 
 
 @builtin
-assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
+assign_at_elems :: proc(array: ^$T/[dynamic]$E, index: int, #no_broadcast args: ..E, loc := #caller_location) -> (ok: bool, err: Allocator_Error) #no_bounds_check #optional_allocator_error {
 	new_size := index + len(args)
 	if len(args) == 0 {
 		ok = true
@@ -729,11 +732,10 @@ clear_dynamic_array :: proc "contextless" (array: ^$T/[dynamic]$E) {
 // `reserve_dynamic_array` will try to reserve memory of a passed dynamic array or map to the requested element count (setting the `cap`).
 //
 // Note: Prefer the procedure group `reserve`.
-_reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
-	if array == nil {
+_reserve_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, capacity: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+	if a == nil {
 		return nil
 	}
-	a := (^Raw_Dynamic_Array)(array)
 
 	if capacity <= a.cap {
 		return nil
@@ -744,15 +746,15 @@ _reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: i
 	}
 	assert(a.allocator.procedure != nil)
 
-	old_size  := a.cap * size_of(E)
-	new_size  := capacity * size_of(E)
+	old_size  := a.cap * size_of_elem
+	new_size  := capacity * size_of_elem
 	allocator := a.allocator
 
 	new_data: []byte
 	if should_zero {
-		new_data = mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	} else {
-		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	}
 	if new_data == nil && new_size > 0 {
 		return .Out_Of_Memory
@@ -765,26 +767,23 @@ _reserve_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, capacity: i
 
 @builtin
 reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return _reserve_dynamic_array(array, capacity, true, loc)
+	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, true, loc)
 }
 
 @builtin
 non_zero_reserve_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return _reserve_dynamic_array(array, capacity, false, loc)
+	return _reserve_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), capacity, false, loc)
 }
 
-// `resize_dynamic_array` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
-//
-// Note: Prefer the procedure group `resize`
-_resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
-	if array == nil {
+
+_resize_dynamic_array :: #force_inline proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, length: int, should_zero: bool, loc := #caller_location) -> Allocator_Error {
+	if a == nil {
 		return nil
 	}
-	a := (^Raw_Dynamic_Array)(array)
 
 	if length <= a.cap {
 		if should_zero && a.len < length {
-			intrinsics.mem_zero(([^]E)(a.data)[a.len:], (length-a.len)*size_of(E))
+			intrinsics.mem_zero(([^]byte)(a.data)[a.len*size_of_elem:], (length-a.len)*size_of_elem)
 		}
 		a.len = max(length, 0)
 		return nil
@@ -795,15 +794,15 @@ _resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int,
 	}
 	assert(a.allocator.procedure != nil)
 
-	old_size  := a.cap * size_of(E)
-	new_size  := length * size_of(E)
+	old_size  := a.cap  * size_of_elem
+	new_size  := length * size_of_elem
 	allocator := a.allocator
 
 	new_data : []byte
 	if should_zero {
-		new_data = mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	} else {
-		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of(E), allocator, loc) or_return
+		new_data = non_zero_mem_resize(a.data, old_size, new_size, align_of_elem, allocator, loc) or_return
 	}
 	if new_data == nil && new_size > 0 {
 		return .Out_Of_Memory
@@ -815,14 +814,17 @@ _resize_dynamic_array :: #force_inline proc(array: ^$T/[dynamic]$E, length: int,
 	return nil
 }
 
+// `resize_dynamic_array` will try to resize memory of a passed dynamic array or map to the requested element count (setting the `len`, and possibly `cap`).
+//
+// Note: Prefer the procedure group `resize`
 @builtin
 resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
-	return _resize_dynamic_array(array, length, true, loc=loc)
+	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, true, loc=loc)
 }
 
 @builtin
 non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: int, loc := #caller_location) -> Allocator_Error {
-	return _resize_dynamic_array(array, length, false, loc=loc)
+	return _resize_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), length, false, loc=loc)
 }
 
 /*
@@ -837,10 +839,13 @@ non_zero_resize_dynamic_array :: proc(array: ^$T/[dynamic]$E, #any_int length: i
 	Note: Prefer the procedure group `shrink`
 */
 shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
-	if array == nil {
+	return _shrink_dynamic_array((^Raw_Dynamic_Array)(array), size_of(E), align_of(E), new_cap, loc)
+}
+
+_shrink_dynamic_array :: proc(a: ^Raw_Dynamic_Array, size_of_elem, align_of_elem: int, new_cap := -1, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
+	if a == nil {
 		return
 	}
-	a := (^Raw_Dynamic_Array)(array)
 
 	new_cap := new_cap if new_cap >= 0 else a.len
 
@@ -853,10 +858,10 @@ shrink_dynamic_array :: proc(array: ^$T/[dynamic]$E, new_cap := -1, loc := #call
 	}
 	assert(a.allocator.procedure != nil)
 
-	old_size := a.cap * size_of(E)
-	new_size := new_cap * size_of(E)
+	old_size := a.cap * size_of_elem
+	new_size := new_cap * size_of_elem
 
-	new_data := mem_resize(a.data, old_size, new_size, align_of(E), a.allocator, loc) or_return
+	new_data := mem_resize(a.data, old_size, new_size, align_of_elem, a.allocator, loc) or_return
 
 	a.data = raw_data(new_data)
 	a.len = min(new_cap, a.len)
diff --git a/base/runtime/core_builtin_soa.odin b/base/runtime/core_builtin_soa.odin
index f1b17cbef..7f7f5f086 100644
--- a/base/runtime/core_builtin_soa.odin
+++ b/base/runtime/core_builtin_soa.odin
@@ -352,7 +352,7 @@ non_zero_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, #no_broadcast args
 }
 
 
-_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, zero_memory: bool, #no_broadcast args: ..E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
+_append_soa_elems :: proc(array: ^$T/#soa[dynamic]$E, zero_memory: bool, #no_broadcast args: []E, loc := #caller_location) -> (n: int, err: Allocator_Error) #optional_allocator_error {
 	if array == nil {
 		return
 	}
diff --git a/base/runtime/dynamic_map_internal.odin b/base/runtime/dynamic_map_internal.odin
index 5ad155400..3dded7716 100644
--- a/base/runtime/dynamic_map_internal.odin
+++ b/base/runtime/dynamic_map_internal.odin
@@ -577,7 +577,7 @@ map_grow_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Inf
 
 
 @(require_results)
-map_reserve_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uintptr, loc := #caller_location) -> Allocator_Error {
+map_reserve_dynamic :: #force_no_inline proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uintptr, loc := #caller_location) -> Allocator_Error {
 	@(require_results)
 	ceil_log2 :: #force_inline proc "contextless" (x: uintptr) -> uintptr {
 		z := intrinsics.count_leading_zeros(x)
@@ -641,7 +641,7 @@ map_reserve_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_
 
 
 @(require_results)
-map_shrink_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
+map_shrink_dynamic :: #force_no_inline proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, loc := #caller_location) -> (did_shrink: bool, err: Allocator_Error) {
 	if m.allocator.procedure == nil {
 		m.allocator = context.allocator
 	}
@@ -688,7 +688,7 @@ map_shrink_dynamic :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_I
 }
 
 @(require_results)
-map_free_dynamic :: proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_location) -> Allocator_Error {
+map_free_dynamic :: #force_no_inline proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_location) -> Allocator_Error {
 	ptr := rawptr(map_data(m))
 	size := int(map_total_allocation_size(uintptr(map_cap(m)), info))
 	err := mem_free_with_size(ptr, size, m.allocator, loc)
@@ -700,7 +700,7 @@ map_free_dynamic :: proc "odin" (m: Raw_Map, info: ^Map_Info, loc := #caller_loc
 }
 
 @(require_results)
-map_lookup_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (index: uintptr, ok: bool) {
+map_lookup_dynamic :: #force_no_inline proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (index: uintptr, ok: bool) {
 	if map_len(m) == 0 {
 		return 0, false
 	}
@@ -723,7 +723,7 @@ map_lookup_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info,
 	}
 }
 @(require_results)
-map_exists_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (ok: bool) {
+map_exists_dynamic :: #force_no_inline proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (ok: bool) {
 	if map_len(m) == 0 {
 		return false
 	}
@@ -749,7 +749,7 @@ map_exists_dynamic :: proc "contextless" (m: Raw_Map, #no_alias info: ^Map_Info,
 
 
 @(require_results)
-map_erase_dynamic :: #force_inline proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (old_k, old_v: uintptr, ok: bool) {
+map_erase_dynamic :: #force_no_inline proc "contextless" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, k: uintptr) -> (old_k, old_v: uintptr, ok: bool) {
 	index := map_lookup_dynamic(m^, info, k) or_return
 	ks, vs, hs, _, _ := map_kvh_data_dynamic(m^, info)
 	hs[index] |= TOMBSTONE_MASK
diff --git a/base/runtime/print.odin b/base/runtime/print.odin
index 0262e8ef6..45f6f01ef 100644
--- a/base/runtime/print.odin
+++ b/base/runtime/print.odin
@@ -401,15 +401,16 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		}
 
 		print_string("struct ")
-		if info.is_packed    { print_string("#packed ") }
-		if info.is_raw_union { print_string("#raw_union ") }
-		if info.custom_align {
+		if .packed    in info.flags { print_string("#packed ") }
+		if .raw_union in info.flags { print_string("#raw_union ") }
+		if .no_copy   in info.flags { print_string("#no_copy ") }
+		if .align in info.flags {
 			print_string("#align(")
 			print_u64(u64(ti.align))
 			print_string(") ")
 		}
 		print_byte('{')
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { print_string(", ") }
 			print_string(name)
 			print_string(": ")
@@ -469,7 +470,7 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) {
 		print_string("bit_field ")
 		print_type(info.backing_type)
 		print_string(" {")
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { print_string(", ") }
 			print_string(name)
 			print_string(": ")
diff --git a/base/runtime/wasm_allocator.odin b/base/runtime/wasm_allocator.odin
index f4b399c47..6bafaa489 100644
--- a/base/runtime/wasm_allocator.odin
+++ b/base/runtime/wasm_allocator.odin
@@ -297,7 +297,8 @@ lock :: proc(a: ^WASM_Allocator) {
 					return
 				}
 
-				assert(intrinsics.wasm_memory_atomic_wait32((^u32)(&a.mu), u32(new_state), -1) != 0)
+				ret := intrinsics.wasm_memory_atomic_wait32((^u32)(&a.mu), u32(new_state), -1)
+				assert(ret != 0)
 				intrinsics.cpu_relax()
 			}
 		}
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index 208949fd8..7cbf092ac 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -1167,3 +1167,28 @@ fields_proc :: proc(s: []byte, f: proc(rune) -> bool, allocator := context.alloc
 
 	return subslices[:]
 }
+
+// alias returns true iff a and b have a non-zero length, and any part of
+// a overlaps with b.
+alias :: proc "contextless" (a, b: []byte) -> bool {
+	a_len, b_len := len(a), len(b)
+	if a_len == 0 || b_len == 0 {
+		return false
+	}
+
+	a_start, b_start := uintptr(raw_data(a)), uintptr(raw_data(b))
+	a_end, b_end := a_start + uintptr(a_len-1), b_start + uintptr(b_len-1)
+
+	return a_start <= b_end && b_start <= a_end
+}
+
+// alias_inexactly returns true iff a and b have a non-zero length,
+// the base pointer of a and b are NOT equal, and any part of a overlaps
+// with b (ie: `alias(a, b)` with an exception that returns false for
+// `a == b`, `b = a[:len(a)-69]` and similar conditions).
+alias_inexactly :: proc "contextless" (a, b: []byte) -> bool {
+	if raw_data(a) == raw_data(b) {
+		return false
+	}
+	return alias(a, b)
+}
diff --git a/core/compress/zlib/zlib.odin b/core/compress/zlib/zlib.odin
index 005267d15..c7ae9e9c8 100644
--- a/core/compress/zlib/zlib.odin
+++ b/core/compress/zlib/zlib.odin
@@ -235,7 +235,7 @@ allocate_huffman_table :: proc(allocator := context.allocator) -> (z: ^Huffman_T
 }
 
 @(optimization_mode="favor_size")
-build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
+build_huffman :: #force_no_inline proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
 	sizes:     [HUFFMAN_MAX_BITS+1]int
 	next_code: [HUFFMAN_MAX_BITS+1]int
 
@@ -670,4 +670,4 @@ inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, raw := fals
 	return inflate_raw(&ctx, expected_output_size=expected_output_size)
 }
 
-inflate :: proc{inflate_from_context, inflate_from_byte_array}
+inflate :: proc{inflate_from_context, inflate_from_byte_array}
\ No newline at end of file
diff --git a/core/container/intrusive/list/doc.odin b/core/container/intrusive/list/doc.odin
new file mode 100644
index 000000000..1a5a12f49
--- /dev/null
+++ b/core/container/intrusive/list/doc.odin
@@ -0,0 +1,46 @@
+/*
+Package list implements an intrusive doubly-linked list.
+
+An intrusive container requires a `Node` to be embedded in your own structure, like this:
+
+	My_String :: struct {
+		node:  list.Node,
+		value: string,
+	}
+
+Embedding the members of a `list.Node` in your structure with the `using` keyword is also allowed:
+
+	My_String :: struct {
+		using node: list.Node,
+		value: string,
+	}
+
+Here is a full example:
+
+	package test
+	
+	import "core:fmt"
+	import "core:container/intrusive/list"
+	
+	main :: proc() {
+	    l: list.List
+	
+	    one := My_String{value="Hello"}
+	    two := My_String{value="World"}
+	
+	    list.push_back(&l, &one.node)
+	    list.push_back(&l, &two.node)
+	
+	    iter := list.iterator_head(l, My_String, "node")
+	    for s in list.iterate_next(&iter) {
+	        fmt.println(s.value)
+	    }
+	}
+	
+	My_String :: struct {
+	    node:  list.Node,
+	    value: string,
+	}
+
+*/
+package container_intrusive_list
diff --git a/core/container/intrusive/list/intrusive_list.odin b/core/container/intrusive/list/intrusive_list.odin
index 1a3175002..5b29efb22 100644
--- a/core/container/intrusive/list/intrusive_list.odin
+++ b/core/container/intrusive/list/intrusive_list.odin
@@ -18,11 +18,18 @@ List :: struct {
 	tail: ^Node,
 }
 
-
+// The list link you must include in your own structure.
 Node :: struct {
 	prev, next: ^Node,
 }
 
+/*
+Inserts a new element at the front of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure
+*/
 push_front :: proc "contextless" (list: ^List, node: ^Node) {
 	if list.head != nil {
 		list.head.prev = node
@@ -33,7 +40,13 @@ push_front :: proc "contextless" (list: ^List, node: ^Node) {
 		node.prev, node.next = nil, nil
 	}
 }
+/*
+Inserts a new element at the back of the list with O(1) time complexity.
 
+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure
+*/
 push_back :: proc "contextless" (list: ^List, node: ^Node) {
 	if list.tail != nil {
 		list.tail.next = node
@@ -45,6 +58,13 @@ push_back :: proc "contextless" (list: ^List, node: ^Node) {
 	}
 }
 
+/*
+Removes an element from a list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+- node: The node member of the user-defined element structure to be removed
+*/
 remove :: proc "contextless" (list: ^List, node: ^Node) {
 	if node != nil {
 		if node.next != nil {
@@ -61,7 +81,13 @@ remove :: proc "contextless" (list: ^List, node: ^Node) {
 		}
 	}
 }
+/*
+Removes from the given list all elements that satisfy a condition with O(N) time complexity.
 
+**Inputs**
+- list: The container list
+- to_erase: The condition procedure. It should return `true` if a node should be removed, `false` otherwise
+*/
 remove_by_proc :: proc(list: ^List, to_erase: proc(^Node) -> bool) {
 	for node := list.head; node != nil; {
 		next := node.next
@@ -82,7 +108,13 @@ remove_by_proc :: proc(list: ^List, to_erase: proc(^Node) -> bool) {
 		node = next
 	}
 }
+/*
+Removes from the given list all elements that satisfy a condition with O(N) time complexity.
 
+**Inputs**
+- list: The container list
+- to_erase: The _contextless_ condition procedure. It should return `true` if a node should be removed, `false` otherwise
+*/
 remove_by_proc_contextless :: proc(list: ^List, to_erase: proc "contextless" (^Node) -> bool) {
 	for node := list.head; node != nil; {
 		next := node.next
@@ -104,12 +136,26 @@ remove_by_proc_contextless :: proc(list: ^List, to_erase: proc "contextless" (^N
 	}
 }
 
+/*
+Checks whether the given list does not contain any element.
 
+**Inputs**
+- list: The container list
 
+**Returns** `true` if `list` is empty, `false` otherwise
+*/
 is_empty :: proc "contextless" (list: ^List) -> bool {
 	return list.head == nil
 }
 
+/*
+Removes and returns the element at the front of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+
+**Returns** The node member of the user-defined element structure, or `nil` if the list is empty
+*/
 pop_front :: proc "contextless" (list: ^List) -> ^Node {
 	link := list.head
 	if link == nil {
@@ -130,6 +176,14 @@ pop_front :: proc "contextless" (list: ^List) -> ^Node {
 	return link
 
 }
+/*
+Removes and returns the element at the back of the list with O(1) time complexity.
+
+**Inputs**
+- list: The container list
+
+**Returns** The node member of the user-defined element structure, or `nil` if the list is empty
+*/
 pop_back :: proc "contextless" (list: ^List) -> ^Node {
 	link := list.tail
 	if link == nil {
@@ -151,29 +205,102 @@ pop_back :: proc "contextless" (list: ^List) -> ^Node {
 }
 
 
+
 Iterator :: struct($T: typeid) {
 	curr:   ^Node,
 	offset: uintptr,
 }
 
+/*
+Creates an iterator pointing at the head of the given list. For an example, see `iterate_next`.
+
+**Inputs**
+- list: The container list
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at the head of `list`
+
+*/
 iterator_head :: proc "contextless" (list: List, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {list.head, offset_of_by_string(T, field_name)}
 }
+/*
+Creates an iterator pointing at the tail of the given list. For an example, see `iterate_prev`.
 
+**Inputs**
+- list: The container list
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at the tail of `list`
+
+*/
 iterator_tail :: proc "contextless" (list: List, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {list.tail, offset_of_by_string(T, field_name)}
 }
+/*
+Creates an iterator pointing at the specified node of a list.
 
+**Inputs**
+- node: a list node
+- T: The type of the list's elements
+- field_name: The name of the node field in the `T` structure
+
+**Returns** An iterator pointing at `node`
+
+*/
 iterator_from_node :: proc "contextless" (node: ^Node, $T: typeid, $field_name: string) -> Iterator(T)
 	where intrinsics.type_has_field(T, field_name),
 	      intrinsics.type_field_type(T, field_name) == Node {
 	return {node, offset_of_by_string(T, field_name)}
 }
 
+/*
+Retrieves the next element in a list and advances the iterator.
+
+**Inputs**  
+- it: The iterator
+
+**Returns**
+- ptr: The next list element
+- ok: `true` if the element is valid (the iterator could advance), `false` otherwise
+
+Example:
+
+	import "core:fmt"
+	import "core:container/intrusive/list"
+
+	iterate_next_example :: proc() {
+		l: list.List
+
+		one := My_Struct{value=1}
+		two := My_Struct{value=2}
+
+		list.push_back(&l, &one.node)
+		list.push_back(&l, &two.node)
+
+		it := list.iterator_head(l, My_Struct, "node")
+		for num in list.iterate_next(&it) {
+			fmt.println(num.value)
+		}
+	}
+
+	My_Struct :: struct {
+		node : list.Node,
+		value: int,
+	}
+
+Output:
+
+	1
+	2
+
+*/
 iterate_next :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	node := it.curr
 	if node == nil {
@@ -183,7 +310,47 @@ iterate_next :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 
 	return (^T)(uintptr(node) - it.offset), true
 }
+/*
+Retrieves the previous element in a list and recede the iterator.
 
+**Inputs**  
+- it: The iterator
+
+**Returns**
+- ptr: The previous list element
+- ok: `true` if the element is valid (the iterator could recede), `false` otherwise
+
+Example:
+
+	import "core:fmt"
+	import "core:container/intrusive/list"
+
+	iterate_next_example :: proc() {
+		l: list.List
+
+		one := My_Struct{value=1}
+		two := My_Struct{value=2}
+
+		list.push_back(&l, &one.node)
+		list.push_back(&l, &two.node)
+
+		it := list.iterator_tail(l, My_Struct, "node")
+		for num in list.iterate_prev(&it) {
+			fmt.println(num.value)
+		}
+	}
+
+	My_Struct :: struct {
+		node : list.Node,
+		value: int,
+	}
+
+Output:
+
+	2
+	1
+
+*/
 iterate_prev :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	node := it.curr
 	if node == nil {
@@ -192,4 +359,4 @@ iterate_prev :: proc "contextless" (it: ^Iterator($T)) -> (ptr: ^T, ok: bool) {
 	it.curr = node.prev
 
 	return (^T)(uintptr(node) - it.offset), true
-}
\ No newline at end of file
+}
diff --git a/core/container/queue/queue.odin b/core/container/queue/queue.odin
index e7a60dde0..f83a5f2b7 100644
--- a/core/container/queue/queue.odin
+++ b/core/container/queue/queue.odin
@@ -95,11 +95,11 @@ front_ptr :: proc(q: ^$Q/Queue($T)) -> ^T {
 }
 
 back :: proc(q: ^$Q/Queue($T)) -> T {
-	idx := (q.offset+uint(q.len))%builtin.len(q.data)
+	idx := (q.offset+uint(q.len - 1))%builtin.len(q.data)
 	return q.data[idx]
 }
 back_ptr :: proc(q: ^$Q/Queue($T)) -> ^T {
-	idx := (q.offset+uint(q.len))%builtin.len(q.data)
+	idx := (q.offset+uint(q.len - 1))%builtin.len(q.data)
 	return &q.data[idx]
 }
 
diff --git a/core/crypto/_aes/ct64/api.odin b/core/crypto/_aes/ct64/api.odin
index ae624971c..f57a630b1 100644
--- a/core/crypto/_aes/ct64/api.odin
+++ b/core/crypto/_aes/ct64/api.odin
@@ -7,9 +7,8 @@ STRIDE :: 4
 
 // Context is a keyed AES (ECB) instance.
 Context :: struct {
-	_sk_exp:         [120]u64,
-	_num_rounds:     int,
-	_is_initialized: bool,
+	_sk_exp:     [120]u64,
+	_num_rounds: int,
 }
 
 // init initializes a context for AES with the provided key.
@@ -18,13 +17,10 @@ init :: proc(ctx: ^Context, key: []byte) {
 
 	ctx._num_rounds = keysched(skey[:], key)
 	skey_expand(ctx._sk_exp[:], skey[:], ctx._num_rounds)
-	ctx._is_initialized = true
 }
 
 // encrypt_block sets `dst` to `AES-ECB-Encrypt(src)`.
 encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64
 	load_blockx1(&q, src)
 	_encrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
@@ -33,8 +29,6 @@ encrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
 
 // encrypt_block sets `dst` to `AES-ECB-Decrypt(src)`.
 decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64
 	load_blockx1(&q, src)
 	_decrypt(&q, ctx._sk_exp[:], ctx._num_rounds)
@@ -43,8 +37,6 @@ decrypt_block :: proc(ctx: ^Context, dst, src: []byte) {
 
 // encrypt_blocks sets `dst` to `AES-ECB-Encrypt(src[0], .. src[n])`.
 encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64 = ---
 	src, dst := src, dst
 
@@ -67,8 +59,6 @@ encrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
 
 // decrypt_blocks sets dst to `AES-ECB-Decrypt(src[0], .. src[n])`.
 decrypt_blocks :: proc(ctx: ^Context, dst, src: [][]byte) {
-	assert(ctx._is_initialized)
-
 	q: [8]u64 = ---
 	src, dst := src, dst
 
diff --git a/core/crypto/_aes/hw_intel/api.odin b/core/crypto/_aes/hw_intel/api.odin
new file mode 100644
index 000000000..5cb5a68bb
--- /dev/null
+++ b/core/crypto/_aes/hw_intel/api.odin
@@ -0,0 +1,43 @@
+//+build amd64
+package aes_hw_intel
+
+import "core:sys/info"
+
+// is_supporte returns true iff hardware accelerated AES
+// is supported.
+is_supported :: proc "contextless" () -> bool {
+	features, ok := info.cpu_features.?
+	if !ok {
+		return false
+	}
+
+	// Note: Everything with AES-NI and PCLMULQDQ has support for
+	// the required SSE extxtensions.
+	req_features :: info.CPU_Features{
+		.sse2,
+		.ssse3,
+		.sse41,
+		.aes,
+		.pclmulqdq,
+	}
+	return features >= req_features
+}
+
+// Context is a keyed AES (ECB) instance.
+Context :: struct {
+	// Note: The ideal thing to do is for the expanded round keys to be
+	// arrays of `__m128i`, however that implies alignment (or using AVX).
+	//
+	// All the people using e-waste processors that don't support an
+	// insturction set that has been around for over 10 years are why
+	// we can't have nice things.
+	_sk_exp_enc: [15][16]byte,
+	_sk_exp_dec: [15][16]byte,
+	_num_rounds: int,
+}
+
+// init initializes a context for AES with the provided key.
+init :: proc(ctx: ^Context, key: []byte) {
+	keysched(ctx, key)
+}
+
diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw_intel/ghash.odin
new file mode 100644
index 000000000..9a5208523
--- /dev/null
+++ b/core/crypto/_aes/hw_intel/ghash.odin
@@ -0,0 +1,281 @@
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd"
+import "core:simd/x86"
+
+@(private = "file")
+GHASH_STRIDE_HW :: 4
+@(private = "file")
+GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE
+
+// GHASH is defined over elements of GF(2^128) with "full little-endian"
+// representation: leftmost byte is least significant, and, within each
+// byte, leftmost _bit_ is least significant. The natural ordering in
+// x86 is "mixed little-endian": bytes are ordered from least to most
+// significant, but bits within a byte are in most-to-least significant
+// order. Going to full little-endian representation would require
+// reversing bits within each byte, which is doable but expensive.
+//
+// Instead, we go to full big-endian representation, by swapping bytes
+// around, which is done with a single _mm_shuffle_epi8() opcode (it
+// comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
+// can use a full big-endian representation because in a carryless
+// multiplication, we have a nice bit reversal property:
+//
+// rev_128(x) * rev_128(y) = rev_255(x * y)
+//
+// So by using full big-endian, we still get the right result, except
+// that it is right-shifted by 1 bit. The left-shift is relatively
+// inexpensive, and it can be mutualised.
+//
+// Since SSE2 opcodes do not have facilities for shitfting full 128-bit
+// values with bit precision, we have to break down values into 64-bit
+// chunks. We number chunks from 0 to 3 in left to right order.
+
+@(private = "file")
+byteswap_index := transmute(x86.__m128i)simd.i8x16{
+	// Note: simd.i8x16 is reverse order from x86._mm_set_epi8.
+	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2,ssse3")
+byteswap :: #force_inline proc "contextless" (x: x86.__m128i) -> x86.__m128i {
+	return x86._mm_shuffle_epi8(x, byteswap_index)
+}
+
+// From a 128-bit value kw, compute kx as the XOR of the two 64-bit
+// halves of kw (into the right half of kx; left half is unspecified),
+// and return kx.
+@(private = "file", require_results, enable_target_feature = "sse2")
+bk :: #force_inline proc "contextless" (kw: x86.__m128i) -> x86.__m128i {
+	return x86._mm_xor_si128(kw, x86._mm_shuffle_epi32(kw, 0x0e))
+}
+
+// Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
+// the XOR of the two values (kx), and return (kw, kx).
+@(private = "file", enable_target_feature = "sse2")
+pbk :: #force_inline proc "contextless" (k0, k1: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	kw := x86._mm_unpacklo_epi64(k1, k0)
+	kx := x86._mm_xor_si128(k0, k1)
+	return kw, kx
+}
+
+// Left-shift by 1 bit a 256-bit value (in four 64-bit words).
+@(private = "file", require_results, enable_target_feature = "sse2")
+sl_256 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) {
+	x0, x1, x2, x3 := x0, x1, x2, x3
+
+	x0 = x86._mm_or_si128(x86._mm_slli_epi64(x0, 1), x86._mm_srli_epi64(x1, 63))
+	x1 = x86._mm_or_si128(x86._mm_slli_epi64(x1, 1), x86._mm_srli_epi64(x2, 63))
+	x2 = x86._mm_or_si128(x86._mm_slli_epi64(x2, 1), x86._mm_srli_epi64(x3, 63))
+	x3 = x86._mm_slli_epi64(x3, 1)
+
+	return x0, x1, x2, x3
+}
+
+// Perform reduction in GF(2^128).
+@(private = "file", require_results, enable_target_feature = "sse2")
+reduce_f128 :: #force_inline proc "contextless" (x0, x1, x2, x3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	x0, x1, x2 := x0, x1, x2
+
+	x1 = x86._mm_xor_si128(
+		x1,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x3,
+				x86._mm_srli_epi64(x3, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x3, 2),
+				x86._mm_srli_epi64(x3, 7))))
+	x2 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x2,
+			x86._mm_slli_epi64(x3, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x3, 62),
+			x86._mm_slli_epi64(x3, 57)))
+	x0 = x86._mm_xor_si128(
+		x0,
+		x86._mm_xor_si128(
+			x86._mm_xor_si128(
+				x2,
+				x86._mm_srli_epi64(x2, 1)),
+			x86._mm_xor_si128(
+				x86._mm_srli_epi64(x2, 2),
+				x86._mm_srli_epi64(x2, 7))))
+	x1 = x86._mm_xor_si128(
+		x86._mm_xor_si128(
+			x1,
+			x86._mm_slli_epi64(x2, 63)),
+		x86._mm_xor_si128(
+			x86._mm_slli_epi64(x2, 62),
+			x86._mm_slli_epi64(x2, 57)))
+
+	return x0, x1
+}
+
+// Square value kw in GF(2^128) into (dw,dx).
+@(private = "file", require_results, enable_target_feature = "sse2,pclmul")
+square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	z1 := x86._mm_clmulepi64_si128(kw, kw, 0x11)
+	z3 := x86._mm_clmulepi64_si128(kw, kw, 0x00)
+	z0 := x86._mm_shuffle_epi32(z1, 0x0E)
+	z2 := x86._mm_shuffle_epi32(z3, 0x0E)
+	z0, z1, z2, z3 = sl_256(z0, z1, z2, z3)
+	z0, z1 = reduce_f128(z0, z1, z2, z3)
+	return pbk(z0, z1)
+}
+
+// ghash calculates the GHASH of data, with the key `key`, and input `dst`
+// and `data`, and stores the resulting digest in `dst`.
+//
+// Note: `dst` is both an input and an output, to support easy implementation
+// of GCM.
+@(enable_target_feature = "sse2,ssse3,pclmul")
+ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check {
+	if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE {
+		intrinsics.trap()
+	}
+
+	// Note: BearSSL opts to copy the remainder into a zero-filled
+	// 64-byte buffer.  We do something slightly more simple.
+
+	// Load key and dst (h and y).
+	yw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(dst)))
+	h1w := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+	yw = byteswap(yw)
+	h1w = byteswap(h1w)
+	h1x := bk(h1w)
+
+	// Process 4 blocks at a time
+	buf := data
+	l := len(buf)
+	if l >= GHASH_STRIDE_BYTES_HW {
+		// Compute h2 = h^2
+		h2w, h2x := square_f128(h1w)
+
+		// Compute h3 = h^3 = h*(h^2)
+		t1 := x86._mm_clmulepi64_si128(h1w, h2w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(h1w, h2w, 0x00)
+		t2 := x86._mm_xor_si128(
+			x86._mm_clmulepi64_si128(h1x, h2x, 0x00),
+			x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		h3w, h3x := pbk(t0, t1)
+
+		// Compute h4 = h^4 = (h^2)^2
+		h4w, h4x := square_f128(h2w)
+
+		for l >= GHASH_STRIDE_BYTES_HW {
+			aw0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf)))
+			aw1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[16:])))
+			aw2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[32:])))
+			aw3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(buf[48:])))
+			aw0 = byteswap(aw0)
+			aw1 = byteswap(aw1)
+			aw2 = byteswap(aw2)
+			aw3 = byteswap(aw3)
+			buf, l = buf[GHASH_STRIDE_BYTES_HW:], l - GHASH_STRIDE_BYTES_HW
+
+			aw0 = x86._mm_xor_si128(aw0, yw)
+			ax1 := bk(aw1)
+			ax2 := bk(aw2)
+			ax3 := bk(aw3)
+			ax0 := bk(aw0)
+
+			t1 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x11),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x11)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x11),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x11)))
+			t3 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw0, h4w, 0x00),
+					x86._mm_clmulepi64_si128(aw1, h3w, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(aw2, h2w, 0x00),
+					x86._mm_clmulepi64_si128(aw3, h1w, 0x00)))
+			t2 = x86._mm_xor_si128(
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax0, h4x, 0x00),
+					x86._mm_clmulepi64_si128(ax1, h3x, 0x00)),
+				x86._mm_xor_si128(
+					x86._mm_clmulepi64_si128(ax2, h2x, 0x00),
+					x86._mm_clmulepi64_si128(ax3, h1x, 0x00)))
+			t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+			t0 = x86._mm_shuffle_epi32(t1, 0x0E)
+			t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+			t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+			t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+			t0, t1 = reduce_f128(t0, t1, t2, t3)
+			yw = x86._mm_unpacklo_epi64(t1, t0)
+		}
+	}
+
+	// Process 1 block at a time
+	src: []byte
+	for l > 0 {
+		if l >= _aes.GHASH_BLOCK_SIZE {
+			src = buf
+			buf = buf[_aes.GHASH_BLOCK_SIZE:]
+			l -= _aes.GHASH_BLOCK_SIZE
+		} else {
+			tmp: [_aes.GHASH_BLOCK_SIZE]byte
+			copy(tmp[:], buf)
+			src = tmp[:]
+			l = 0
+		}
+
+		aw := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+		aw = byteswap(aw)
+
+		aw = x86._mm_xor_si128(aw, yw)
+		ax := bk(aw)
+
+		t1 := x86._mm_clmulepi64_si128(aw, h1w, 0x11)
+		t3 := x86._mm_clmulepi64_si128(aw, h1w, 0x00)
+		t2 := x86._mm_clmulepi64_si128(ax, h1x, 0x00)
+		t2 = x86._mm_xor_si128(t2, x86._mm_xor_si128(t1, t3))
+		t0 := x86._mm_shuffle_epi32(t1, 0x0E)
+		t1 = x86._mm_xor_si128(t1, x86._mm_shuffle_epi32(t2, 0x0E))
+		t2 = x86._mm_xor_si128(t2, x86._mm_shuffle_epi32(t3, 0x0E))
+		t0, t1, t2, t3 = sl_256(t0, t1, t2, t3)
+		t0, t1 = reduce_f128(t0, t1, t2, t3)
+		yw = x86._mm_unpacklo_epi64(t1, t0)
+	}
+
+	// Write back the hash (dst, aka y)
+	yw = byteswap(yw)
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), yw)
+}
diff --git a/core/crypto/_aes/hw_intel/hw_intel_keysched.odin b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin
new file mode 100644
index 000000000..911dffbd5
--- /dev/null
+++ b/core/crypto/_aes/hw_intel/hw_intel_keysched.odin
@@ -0,0 +1,178 @@
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//   1. Redistributions of source code must retain the above copyright
+//      notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//+build amd64
+package aes_hw_intel
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:mem"
+import "core:simd/x86"
+
+// Intel AES-NI based implementation.  Inspiration taken from BearSSL.
+//
+// Note: This assumes that the SROA optimization pass is enabled to be
+// anything resembling performat otherwise, LLVM will not elide a massive
+// number of redundant loads/stores it generates for every intrinsic call.
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step128 :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xff)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
+@(private = "file", require_results, enable_target_feature = "sse,sse2")
+expand_step192a :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> (x86.__m128i, x86.__m128i) {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	tmp := k2
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	r1 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(tmp), transmute(x86.__m128)(k1), 0x44))
+	r2 := transmute(x86.__m128i)(x86._mm_shuffle_ps(transmute(x86.__m128)(k1), transmute(x86.__m128)(k2), 0x4e))
+
+	return r1, r2
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step192b :: #force_inline proc (k1_, k2_: ^x86.__m128i, k3: x86.__m128i) -> x86.__m128i {
+	k1, k2, k3 := k1_^, k2_^, k3
+
+	k3 = x86._mm_shuffle_epi32(k3, 0x55)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, k3)
+
+	k2 = x86._mm_xor_si128(k2, x86._mm_slli_si128(k2, 0x04))
+	k2 = x86._mm_xor_si128(k2, x86._mm_shuffle_epi32(k1, 0xff))
+
+	k1_, k2_ := k1_, k2_
+	k1_^, k2_^ = k1, k2
+
+	return k1
+}
+
+@(private = "file", require_results, enable_target_feature = "sse2")
+expand_step256b :: #force_inline proc(k1, k2: x86.__m128i) -> x86.__m128i {
+	k1, k2 := k1, k2
+
+	k2 = x86._mm_shuffle_epi32(k2, 0xaa)
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	k1 = x86._mm_xor_si128(k1, x86._mm_slli_si128(k1, 0x04))
+	return x86._mm_xor_si128(k1, k2)
+}
+
+@(private = "file", enable_target_feature = "aes")
+derive_dec_keys :: proc(ctx: ^Context, sks: ^[15]x86.__m128i, num_rounds: int) {
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[0]), sks[num_rounds])
+	for i in 1 ..< num_rounds {
+		tmp := x86._mm_aesimc_si128(sks[i])
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds - i]), tmp)
+	}
+	intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_dec[num_rounds]), sks[0])
+}
+
+@(private, enable_target_feature = "sse,sse2,aes")
+keysched :: proc(ctx: ^Context, key: []byte) {
+	sks: [15]x86.__m128i = ---
+
+	// Compute the encryption keys.
+	num_rounds, key_len := 0, len(key)
+	switch key_len {
+	case _aes.KEY_SIZE_128:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[0], 0x01))
+		sks[2] = expand_step128(sks[1], x86._mm_aeskeygenassist_si128(sks[1], 0x02))
+		sks[3] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[2], 0x04))
+		sks[4] = expand_step128(sks[3], x86._mm_aeskeygenassist_si128(sks[3], 0x08))
+		sks[5] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[4], 0x10))
+		sks[6] = expand_step128(sks[5], x86._mm_aeskeygenassist_si128(sks[5], 0x20))
+		sks[7] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[6], 0x40))
+		sks[8] = expand_step128(sks[7], x86._mm_aeskeygenassist_si128(sks[7], 0x80))
+		sks[9] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[8], 0x1b))
+		sks[10] = expand_step128(sks[9], x86._mm_aeskeygenassist_si128(sks[9], 0x36))
+		num_rounds = _aes.ROUNDS_128
+	case _aes.KEY_SIZE_192:
+		k0 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		k1 := x86.__m128i{
+			intrinsics.unaligned_load((^i64)(raw_data(key[16:]))),
+			0,
+		}
+		sks[0] = k0
+		sks[1], sks[2] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x01))
+		sks[3] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x02))
+		sks[4], sks[5] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x04))
+		sks[6] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x08))
+		sks[7], sks[8] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x10))
+		sks[9] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x20))
+		sks[10], sks[11] = expand_step192a(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x40))
+		sks[12] = expand_step192b(&k0, &k1, x86._mm_aeskeygenassist_si128(k1, 0x80))
+		num_rounds = _aes.ROUNDS_192
+	case _aes.KEY_SIZE_256:
+		sks[0] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key)))
+		sks[1] = intrinsics.unaligned_load((^x86.__m128i)(raw_data(key[16:])))
+		sks[2] = expand_step128(sks[0], x86._mm_aeskeygenassist_si128(sks[1], 0x01))
+		sks[3] = expand_step256b(sks[1], x86._mm_aeskeygenassist_si128(sks[2], 0x01))
+		sks[4] = expand_step128(sks[2], x86._mm_aeskeygenassist_si128(sks[3], 0x02))
+		sks[5] = expand_step256b(sks[3], x86._mm_aeskeygenassist_si128(sks[4], 0x02))
+		sks[6] = expand_step128(sks[4], x86._mm_aeskeygenassist_si128(sks[5], 0x04))
+		sks[7] = expand_step256b(sks[5], x86._mm_aeskeygenassist_si128(sks[6], 0x04))
+		sks[8] = expand_step128(sks[6], x86._mm_aeskeygenassist_si128(sks[7], 0x08))
+		sks[9] = expand_step256b(sks[7], x86._mm_aeskeygenassist_si128(sks[8], 0x08))
+		sks[10] = expand_step128(sks[8], x86._mm_aeskeygenassist_si128(sks[9], 0x10))
+		sks[11] = expand_step256b(sks[9], x86._mm_aeskeygenassist_si128(sks[10], 0x10))
+		sks[12] = expand_step128(sks[10], x86._mm_aeskeygenassist_si128(sks[11], 0x20))
+		sks[13] = expand_step256b(sks[11], x86._mm_aeskeygenassist_si128(sks[12], 0x20))
+		sks[14] = expand_step128(sks[12], x86._mm_aeskeygenassist_si128(sks[13], 0x40))
+		num_rounds = _aes.ROUNDS_256
+	case:
+		panic("crypto/aes: invalid AES key size")
+	}
+	for i in 0 ..= num_rounds {
+		intrinsics.unaligned_store((^x86.__m128i)(&ctx._sk_exp_enc[i]), sks[i])
+	}
+
+	// Compute the decryption keys.  GCM and CTR do not need this, however
+	// ECB, CBC, OCB3, etc do.
+	derive_dec_keys(ctx, &sks, num_rounds)
+
+	ctx._num_rounds = num_rounds
+
+	mem.zero_explicit(&sks, size_of(sks))
+}
diff --git a/core/crypto/aes/aes.odin b/core/crypto/aes/aes.odin
index e895c5fe0..ef305fd21 100644
--- a/core/crypto/aes/aes.odin
+++ b/core/crypto/aes/aes.odin
@@ -6,7 +6,6 @@ See:
 - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf
 - https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
 */
-
 package aes
 
 import "core:crypto/_aes"
diff --git a/core/crypto/aes/aes_ctr.odin b/core/crypto/aes/aes_ctr.odin
index 1821a7bdf..1c5fe31e8 100644
--- a/core/crypto/aes/aes_ctr.odin
+++ b/core/crypto/aes/aes_ctr.odin
@@ -1,5 +1,6 @@
 package aes
 
+import "core:bytes"
 import "core:crypto/_aes/ct64"
 import "core:encoding/endian"
 import "core:math/bits"
@@ -37,14 +38,15 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := Implementation.Hard
 xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) {
 	assert(ctx._is_initialized)
 
-	// TODO: Enforcing that dst and src alias exactly or not at all
-	// is a good idea, though odd aliasing should be extremely uncommon.
-
 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}
 
+	if bytes.alias_inexactly(dst, src) {
+		panic("crypto/aes: dst and src alias inexactly")
+	}
+
 	for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == BLOCK_SIZE {
@@ -123,8 +125,8 @@ reset_ctr :: proc "contextless" (ctx: ^Context_CTR) {
 	ctx._is_initialized = false
 }
 
-@(private)
-ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) {
+@(private = "file")
+ctr_blocks :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
 	// Use the optimized hardware implementation if available.
 	if _, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		ctr_blocks_hw(ctx, dst, src, nr_blocks)
@@ -183,17 +185,17 @@ xor_blocks :: #force_inline proc "contextless" (dst, src: []byte, blocks: [][]by
 	// performance of this implementation matters to where that
 	// optimization would be worth it, use chacha20poly1305, or a
 	// CPU that isn't e-waste.
-	if src != nil {
-		#no_bounds_check {
-			for i in 0 ..< len(blocks) {
-				off := i * BLOCK_SIZE
-				for j in 0 ..< BLOCK_SIZE {
-					blocks[i][j] ~= src[off + j]
+	#no_bounds_check {
+		if src != nil {
+				for i in 0 ..< len(blocks) {
+					off := i * BLOCK_SIZE
+					for j in 0 ..< BLOCK_SIZE {
+						blocks[i][j] ~= src[off + j]
+					}
 				}
-			}
+		}
+		for i in 0 ..< len(blocks) {
+			copy(dst[i * BLOCK_SIZE:], blocks[i])
 		}
 	}
-	for i in 0 ..< len(blocks) {
-		copy(dst[i * BLOCK_SIZE:], blocks[i])
-	}
 }
diff --git a/core/crypto/aes/aes_ctr_hw_intel.odin b/core/crypto/aes/aes_ctr_hw_intel.odin
new file mode 100644
index 000000000..1c9e815ad
--- /dev/null
+++ b/core/crypto/aes/aes_ctr_hw_intel.odin
@@ -0,0 +1,151 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:math/bits"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+CTR_STRIDE_HW :: 4
+@(private)
+CTR_STRIDE_BYTES_HW :: CTR_STRIDE_HW * BLOCK_SIZE
+
+@(private, enable_target_feature = "sse2,aes")
+ctr_blocks_hw :: proc(ctx: ^Context_CTR, dst, src: []byte, nr_blocks: int) #no_bounds_check {
+	hw_ctx := ctx._impl.(Context_Impl_Hardware)
+
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= hw_ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&hw_ctx._sk_exp_enc[i]))
+	}
+
+	hw_inc_ctr := #force_inline proc "contextless" (hi, lo: u64) -> (x86.__m128i, u64, u64) {
+		ret := x86.__m128i{
+			i64(intrinsics.byte_swap(hi)),
+			i64(intrinsics.byte_swap(lo)),
+		}
+
+		hi, lo := hi, lo
+		carry: u64
+
+		lo, carry = bits.add_u64(lo, 1, 0)
+		hi, _ = bits.add_u64(hi, 0, carry)
+		return ret, hi, lo
+	}
+
+	// The latency of AESENC depends on mfg and microarchitecture:
+	// - 7 -> up to Broadwell
+	// - 4 -> AMD and Skylake - Cascade Lake
+	// - 3 -> Ice Lake and newer
+	//
+	// This implementation does 4 blocks at once, since performance
+	// should be "adequate" across most CPUs.
+
+	src, dst := src, dst
+	nr_blocks := nr_blocks
+	ctr_hi, ctr_lo := ctx._ctr_hi, ctx._ctr_lo
+
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	for nr_blocks >= CTR_STRIDE_HW {
+		#unroll for i in 0..< CTR_STRIDE_HW {
+			blks[i], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		xor_blocks_hw(dst, src, blks[:])
+
+		if src != nil {
+			src = src[CTR_STRIDE_BYTES_HW:]
+		}
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	for nr_blocks > 0 {
+		blks[0], ctr_hi, ctr_lo = hw_inc_ctr(ctr_hi, ctr_lo)
+
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch hw_ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		xor_blocks_hw(dst, src, blks[:1])
+
+		if src != nil {
+			src = src[BLOCK_SIZE:]
+		}
+		dst = dst[BLOCK_SIZE:]
+		nr_blocks -= 1
+	}
+
+	// Write back the counter.
+	ctx._ctr_hi, ctx._ctr_lo = ctr_hi, ctr_lo
+
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+@(private, enable_target_feature = "sse2")
+xor_blocks_hw :: proc(dst, src: []byte, blocks: []x86.__m128i) {
+	#no_bounds_check {
+		if src != nil {
+				for i in 0 ..< len(blocks) {
+					off := i * BLOCK_SIZE
+					tmp := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[off:])))
+					blocks[i] = x86._mm_xor_si128(blocks[i], tmp)
+				}
+		}
+		for i in 0 ..< len(blocks) {
+			intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst[i * BLOCK_SIZE:])), blocks[i])
+		}
+	}
+}
diff --git a/core/crypto/aes/aes_ecb_hw_intel.odin b/core/crypto/aes/aes_ecb_hw_intel.odin
new file mode 100644
index 000000000..b2ff36a0c
--- /dev/null
+++ b/core/crypto/aes/aes_ecb_hw_intel.odin
@@ -0,0 +1,58 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto/_aes"
+import "core:simd/x86"
+
+@(private, enable_target_feature = "sse2,aes")
+encrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+	}
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesenc_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i])))
+		}
+		blk = x86._mm_aesenclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}
+
+@(private, enable_target_feature = "sse2,aes")
+decrypt_block_hw :: proc(ctx: ^Context_Impl_Hardware, dst, src: []byte) {
+	blk := intrinsics.unaligned_load((^x86.__m128i)(raw_data(src)))
+
+	blk = x86._mm_xor_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[0])))
+	#unroll for i in 1 ..= 9 {
+		blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+	}
+	switch ctx._num_rounds {
+	case _aes.ROUNDS_128:
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[10])))
+	case _aes.ROUNDS_192:
+		#unroll for i in 10 ..= 11 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[12])))
+	case _aes.ROUNDS_256:
+		#unroll for i in 10 ..= 13 {
+			blk = x86._mm_aesdec_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[i])))
+		}
+		blk = x86._mm_aesdeclast_si128(blk, intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_dec[14])))
+	}
+
+	intrinsics.unaligned_store((^x86.__m128i)(raw_data(dst)), blk)
+}
diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin
index 66ef48db2..25e0cc35b 100644
--- a/core/crypto/aes/aes_gcm.odin
+++ b/core/crypto/aes/aes_gcm.odin
@@ -1,13 +1,16 @@
 package aes
 
+import "core:bytes"
 import "core:crypto"
 import "core:crypto/_aes"
 import "core:crypto/_aes/ct64"
 import "core:encoding/endian"
 import "core:mem"
 
-// GCM_NONCE_SIZE is the size of the GCM nonce in bytes.
+// GCM_NONCE_SIZE is the default size of the GCM nonce in bytes.
 GCM_NONCE_SIZE :: 12
+// GCM_NONCE_SIZE_MAX is the maximum size of the GCM nonce in bytes.
+GCM_NONCE_SIZE_MAX :: 0x2000000000000000 // floor((2^64 - 1) / 8) bits
 // GCM_TAG_SIZE is the size of a GCM tag in bytes.
 GCM_TAG_SIZE :: _aes.GHASH_TAG_SIZE
 
@@ -39,6 +42,9 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
 	if len(dst) != len(plaintext) {
 		panic("crypto/aes: invalid destination ciphertext size")
 	}
+	if bytes.alias_inexactly(dst, plaintext) {
+		panic("crypto/aes: dst and plaintext alias inexactly")
+	}
 
 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		gcm_seal_hw(&impl, dst, tag, nonce, aad, plaintext)
@@ -47,17 +53,19 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, nonce, aad, plaintext: []byte) {
 
 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
 
 	// Note: Our GHASH implementation handles appending padding.
 	ct64.ghash(s[:], h[:], aad)
-	gctr_ct64(ctx, dst, &s, plaintext, &h, nonce, true)
-	final_ghash_ct64(&s, &h, &j0, len(aad), len(plaintext))
+	gctr_ct64(ctx, dst, &s, plaintext, &h, &j0, true)
+	final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(plaintext))
 	copy(tag, s[:])
 
 	mem.zero_explicit(&h, len(h))
 	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
 }
 
 // open_gcm authenticates the aad and ciphertext, and decrypts the ciphertext,
@@ -73,6 +81,9 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
 	if len(dst) != len(ciphertext) {
 		panic("crypto/aes: invalid destination plaintext size")
 	}
+	if bytes.alias_inexactly(dst, ciphertext) {
+		panic("crypto/aes: dst and ciphertext alias inexactly")
+	}
 
 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		return gcm_open_hw(&impl, dst, nonce, aad, ciphertext, tag)
@@ -80,12 +91,13 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
 
 	h: [_aes.GHASH_KEY_SIZE]byte
 	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
 	s: [_aes.GHASH_TAG_SIZE]byte
-	init_ghash_ct64(ctx, &h, &j0, nonce)
+	init_ghash_ct64(ctx, &h, &j0, &j0_enc, nonce)
 
 	ct64.ghash(s[:], h[:], aad)
-	gctr_ct64(ctx, dst, &s, ciphertext, &h, nonce, false)
-	final_ghash_ct64(&s, &h, &j0, len(aad), len(ciphertext))
+	gctr_ct64(ctx, dst, &s, ciphertext, &h, &j0, false)
+	final_ghash_ct64(&s, &h, &j0_enc, len(aad), len(ciphertext))
 
 	ok := crypto.compare_constant_time(s[:], tag) == 1
 	if !ok {
@@ -94,6 +106,7 @@ open_gcm :: proc(ctx: ^Context_GCM, dst, nonce, aad, ciphertext, tag: []byte) ->
 
 	mem.zero_explicit(&h, len(h))
 	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
 	mem.zero_explicit(&s, len(s))
 
 	return ok
@@ -106,19 +119,14 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 	ctx._is_initialized = false
 }
 
-@(private)
+@(private = "file")
 gcm_validate_common_slice_sizes :: proc(tag, nonce, aad, text: []byte) {
 	if len(tag) != GCM_TAG_SIZE {
 		panic("crypto/aes: invalid GCM tag size")
 	}
 
-	// The specification supports nonces in the range [1, 2^64) bits
-	// however per NIST SP 800-38D 5.2.1.1:
-	//
-	// > For IVs, it is recommended that implementations restrict support
-	// > to the length of 96 bits, to promote interoperability, efficiency,
-	// > and simplicity of design.
-	if len(nonce) != GCM_NONCE_SIZE {
+	// The specification supports nonces in the range [1, 2^64) bits.
+	if l := len(nonce); l == 0 || u64(l) >= GCM_NONCE_SIZE_MAX {
 		panic("crypto/aes: invalid GCM nonce size")
 	}
 
@@ -135,6 +143,7 @@ init_ghash_ct64 :: proc(
 	ctx: ^Context_GCM,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
 	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	nonce: []byte,
 ) {
 	impl := &ctx._impl.(ct64.Context)
@@ -142,12 +151,25 @@ init_ghash_ct64 :: proc(
 	// 1. Let H = CIPH(k, 0^128)
 	ct64.encrypt_block(impl, h[:], h[:])
 
+	// Define a block, J0, as follows:
+	if l := len(nonce); l == GCM_NONCE_SIZE {
+		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
+		copy(j0[:], nonce)
+		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
+	} else {
+		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
+		// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
+		ct64.ghash(j0[:], h[:], nonce)
+
+		tmp: [_aes.GHASH_BLOCK_SIZE]byte
+		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
+		ct64.ghash(j0[:], h[:], tmp[:])
+	}
+
 	// ECB encrypt j0, so that we can just XOR with the tag.  In theory
 	// this could be processed along with the final GCTR block, to
 	// potentially save a call to AES-ECB, but... just use AES-NI.
-	copy(j0[:], nonce)
-	j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
-	ct64.encrypt_block(impl, j0[:], j0[:])
+	ct64.encrypt_block(impl, j0_enc[:], j0[:])
 }
 
 @(private = "file")
@@ -175,33 +197,27 @@ gctr_ct64 :: proc(
 	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	src: []byte,
 	h: ^[_aes.GHASH_KEY_SIZE]byte,
-	nonce: []byte,
+	nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
 	is_seal: bool,
-) {
+) #no_bounds_check {
 	ct64_inc_ctr32 := #force_inline proc "contextless" (dst: []byte, ctr: u32) -> u32 {
 		endian.unchecked_put_u32be(dst[12:], ctr)
 		return ctr + 1
 	}
 
-	// 2. Define a block J_0 as follows:
-	//    if len(IV) = 96, then let J0 = IV || 0^31 || 1
-	//
-	// Note: We only support 96 bit IVs.
+	// Setup the counter blocks.
 	tmp, tmp2: [ct64.STRIDE][BLOCK_SIZE]byte = ---, ---
 	ctrs, blks: [ct64.STRIDE][]byte = ---, ---
-	ctr: u32 = 2
+	ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
 	for i in 0 ..< ct64.STRIDE {
 		// Setup scratch space for the keystream.
 		blks[i] = tmp2[i][:]
 
 		// Pre-copy the IV to all the counter blocks.
 		ctrs[i] = tmp[i][:]
-		copy(ctrs[i], nonce)
+		copy(ctrs[i], nonce[:GCM_NONCE_SIZE])
 	}
 
-	// We stitch the GCTR and GHASH operations together, so that only
-	// one pass over the ciphertext is required.
-
 	impl := &ctx._impl.(ct64.Context)
 	src, dst := src, dst
 
diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin
new file mode 100644
index 000000000..7d32d4d96
--- /dev/null
+++ b/core/crypto/aes/aes_gcm_hw_intel.odin
@@ -0,0 +1,243 @@
+//+build amd64
+package aes
+
+import "base:intrinsics"
+import "core:crypto"
+import "core:crypto/_aes"
+import "core:crypto/_aes/hw_intel"
+import "core:encoding/endian"
+import "core:mem"
+import "core:simd/x86"
+
+@(private)
+gcm_seal_hw :: proc(ctx: ^Context_Impl_Hardware, dst, tag, nonce, aad, plaintext: []byte) {
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
+
+	// Note: Our GHASH implementation handles appending padding.
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, plaintext, &h, &j0, true)
+	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(plaintext))
+	copy(tag, s[:])
+
+	mem.zero_explicit(&h, len(h))
+	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
+}
+
+@(private)
+gcm_open_hw :: proc(ctx: ^Context_Impl_Hardware, dst, nonce, aad, ciphertext, tag: []byte) -> bool {
+	h: [_aes.GHASH_KEY_SIZE]byte
+	j0: [_aes.GHASH_BLOCK_SIZE]byte
+	j0_enc: [_aes.GHASH_BLOCK_SIZE]byte
+	s: [_aes.GHASH_TAG_SIZE]byte
+	init_ghash_hw(ctx, &h, &j0, &j0_enc, nonce)
+
+	hw_intel.ghash(s[:], h[:], aad)
+	gctr_hw(ctx, dst, &s, ciphertext, &h, &j0, false)
+	final_ghash_hw(&s, &h, &j0_enc, len(aad), len(ciphertext))
+
+	ok := crypto.compare_constant_time(s[:], tag) == 1
+	if !ok {
+		mem.zero_explicit(raw_data(dst), len(dst))
+	}
+
+	mem.zero_explicit(&h, len(h))
+	mem.zero_explicit(&j0, len(j0))
+	mem.zero_explicit(&j0_enc, len(j0_enc))
+	mem.zero_explicit(&s, len(s))
+
+	return ok
+}
+
+@(private = "file")
+init_ghash_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	j0_enc: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	nonce: []byte,
+) {
+	// 1. Let H = CIPH(k, 0^128)
+	encrypt_block_hw(ctx, h[:], h[:])
+
+	// Define a block, J0, as follows:
+	if l := len(nonce); l == GCM_NONCE_SIZE {
+		// if len(IV) = 96, then let J0 = IV || 0^31 || 1
+		copy(j0[:], nonce)
+		j0[_aes.GHASH_BLOCK_SIZE - 1] = 1
+	} else {
+		// If len(IV) != 96, then let s = 128 ceil(len(IV)/128) - len(IV),
+		// and let J0 = GHASHH(IV || 0^(s+64) || ceil(len(IV))^64).
+		hw_intel.ghash(j0[:], h[:], nonce)
+
+		tmp: [_aes.GHASH_BLOCK_SIZE]byte
+		endian.unchecked_put_u64be(tmp[8:], u64(l) * 8)
+		hw_intel.ghash(j0[:], h[:], tmp[:])
+	}
+
+	// ECB encrypt j0, so that we can just XOR with the tag.
+	encrypt_block_hw(ctx, j0_enc[:], j0[:])
+}
+
+@(private = "file", enable_target_feature = "sse2")
+final_ghash_hw :: proc(
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	j0: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	a_len: int,
+	t_len: int,
+) {
+	blk: [_aes.GHASH_BLOCK_SIZE]byte
+	endian.unchecked_put_u64be(blk[0:], u64(a_len) * 8)
+	endian.unchecked_put_u64be(blk[8:], u64(t_len) * 8)
+
+	hw_intel.ghash(s[:], h[:], blk[:])
+	j0_vec := intrinsics.unaligned_load((^x86.__m128i)(j0))
+	s_vec := intrinsics.unaligned_load((^x86.__m128i)(s))
+	s_vec = x86._mm_xor_si128(s_vec, j0_vec)
+	intrinsics.unaligned_store((^x86.__m128i)(s), s_vec)
+}
+
+@(private = "file", enable_target_feature = "sse2,sse4.1,aes")
+gctr_hw :: proc(
+	ctx: ^Context_Impl_Hardware,
+	dst: []byte,
+	s: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	src: []byte,
+	h: ^[_aes.GHASH_KEY_SIZE]byte,
+	nonce: ^[_aes.GHASH_BLOCK_SIZE]byte,
+	is_seal: bool,
+) #no_bounds_check {
+	sks: [15]x86.__m128i = ---
+	for i in 0 ..= ctx._num_rounds {
+		sks[i] = intrinsics.unaligned_load((^x86.__m128i)(&ctx._sk_exp_enc[i]))
+	}
+
+	// Setup the counter block
+	ctr_blk := intrinsics.unaligned_load((^x86.__m128i)(nonce))
+	ctr := endian.unchecked_get_u32be(nonce[GCM_NONCE_SIZE:]) + 1
+
+	src, dst := src, dst
+
+	// Note: Instead of doing GHASH and CTR separately, it is more
+	// performant to interleave (stitch) the two operations together.
+	// This results in an unreadable mess, so we opt for simplicity
+	// as performance is adequate.
+
+	blks: [CTR_STRIDE_HW]x86.__m128i = ---
+	nr_blocks := len(src) / BLOCK_SIZE
+	for nr_blocks >= CTR_STRIDE_HW {
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:CTR_STRIDE_BYTES_HW])
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+		}
+
+		#unroll for i in 0 ..< CTR_STRIDE_HW {
+			blks[i] = x86._mm_xor_si128(blks[i], sks[0])
+		}
+		#unroll for i in 1 ..= 9 {
+			#unroll for j in 0 ..< CTR_STRIDE_HW {
+				blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+			}
+		}
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[10])
+			}
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[12])
+			}
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				#unroll for j in 0 ..< CTR_STRIDE_HW {
+					blks[j] = x86._mm_aesenc_si128(blks[j], sks[i])
+				}
+			}
+			#unroll for i in 0 ..< CTR_STRIDE_HW {
+				blks[i] = x86._mm_aesenclast_si128(blks[i], sks[14])
+			}
+		}
+
+		xor_blocks_hw(dst, src, blks[:])
+
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:CTR_STRIDE_BYTES_HW])
+		}
+
+		src = src[CTR_STRIDE_BYTES_HW:]
+		dst = dst[CTR_STRIDE_BYTES_HW:]
+		nr_blocks -= CTR_STRIDE_HW
+	}
+
+	// Handle the remainder.
+	for n := len(src); n > 0; {
+		l := min(n, BLOCK_SIZE)
+		if !is_seal {
+			hw_intel.ghash(s[:], h[:], src[:l])
+		}
+
+		blks[0], ctr = hw_inc_ctr32(&ctr_blk, ctr)
+
+		blks[0] = x86._mm_xor_si128(blks[0], sks[0])
+		#unroll for i in 1 ..= 9 {
+			blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+		}
+		switch ctx._num_rounds {
+		case _aes.ROUNDS_128:
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[10])
+		case _aes.ROUNDS_192:
+			#unroll for i in 10 ..= 11 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[12])
+		case _aes.ROUNDS_256:
+			#unroll for i in 10 ..= 13 {
+				blks[0] = x86._mm_aesenc_si128(blks[0], sks[i])
+			}
+			blks[0] = x86._mm_aesenclast_si128(blks[0], sks[14])
+		}
+
+		if l == BLOCK_SIZE {
+			xor_blocks_hw(dst, src, blks[:1])
+		} else {
+			blk: [BLOCK_SIZE]byte
+			copy(blk[:], src)
+			xor_blocks_hw(blk[:], blk[:], blks[:1])
+			copy(dst, blk[:l])
+		}
+		if is_seal {
+			hw_intel.ghash(s[:], h[:], dst[:l])
+		}
+
+		dst = dst[l:]
+		src = src[l:]
+		n -= l
+	}
+
+	mem.zero_explicit(&blks, size_of(blks))
+	mem.zero_explicit(&sks, size_of(sks))
+}
+
+// BUG: Sticking this in gctr_hw (like the other implementations) crashes
+// the compiler.
+//
+// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
+@(private = "file", enable_target_feature = "sse4.1")
+hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
+	ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
+	return ret, ctr + 1
+}
diff --git a/core/crypto/aes/aes_impl_hw_gen.odin b/core/crypto/aes/aes_impl_hw_gen.odin
index 94815f61c..5361c6ef0 100644
--- a/core/crypto/aes/aes_impl_hw_gen.odin
+++ b/core/crypto/aes/aes_impl_hw_gen.odin
@@ -1,3 +1,4 @@
+//+build !amd64
 package aes
 
 @(private = "file")
diff --git a/core/crypto/aes/aes_impl_hw_intel.odin b/core/crypto/aes/aes_impl_hw_intel.odin
new file mode 100644
index 000000000..39ea2dc8d
--- /dev/null
+++ b/core/crypto/aes/aes_impl_hw_intel.odin
@@ -0,0 +1,18 @@
+//+build amd64
+package aes
+
+import "core:crypto/_aes/hw_intel"
+
+// is_hardware_accelerated returns true iff hardware accelerated AES
+// is supported.
+is_hardware_accelerated :: proc "contextless" () -> bool {
+	return hw_intel.is_supported()
+}
+
+@(private)
+Context_Impl_Hardware :: hw_intel.Context
+
+@(private, enable_target_feature = "sse2,aes")
+init_impl_hw :: proc(ctx: ^Context_Impl_Hardware, key: []byte) {
+	hw_intel.init(ctx, key)
+}
diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin
index 7f0950d03..73d3e1ea2 100644
--- a/core/crypto/chacha20/chacha20.odin
+++ b/core/crypto/chacha20/chacha20.odin
@@ -7,6 +7,7 @@ See:
 */
 package chacha20
 
+import "core:bytes"
 import "core:encoding/endian"
 import "core:math/bits"
 import "core:mem"
@@ -121,14 +122,15 @@ seek :: proc(ctx: ^Context, block_nr: u64) {
 xor_bytes :: proc(ctx: ^Context, dst, src: []byte) {
 	assert(ctx._is_initialized)
 
-	// TODO: Enforcing that dst and src alias exactly or not at all
-	// is a good idea, though odd aliasing should be extremely uncommon.
-
 	src, dst := src, dst
 	if dst_len := len(dst); dst_len < len(src) {
 		src = src[:dst_len]
 	}
 
+	if bytes.alias_inexactly(dst, src) {
+		panic("crypto/chacha20: dst and src alias inexactly")
+	}
+
 	for remaining := len(src); remaining > 0; {
 		// Process multiple blocks at once
 		if ctx._off == _BLOCK_SIZE {
diff --git a/core/crypto/crypto.odin b/core/crypto/crypto.odin
index f83d20dd7..323cc45d6 100644
--- a/core/crypto/crypto.odin
+++ b/core/crypto/crypto.odin
@@ -60,7 +60,11 @@ rand_bytes :: proc (dst: []byte) {
 	_rand_bytes(dst)
 }
 
-
+// random_generator returns a `runtime.Random_Generator` backed by the
+// system entropy source.
+//
+// Support for the system entropy source can be checked with the
+// `HAS_RAND_BYTES` boolean constant.
 random_generator :: proc() -> runtime.Random_Generator {
 	return {
 		procedure = proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) {
diff --git a/core/encoding/cbor/marshal.odin b/core/encoding/cbor/marshal.odin
index 2cdf384c3..6657807f5 100644
--- a/core/encoding/cbor/marshal.odin
+++ b/core/encoding/cbor/marshal.odin
@@ -351,7 +351,8 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 				builder := strings.builder_from_slice(res[:])
 				e.writer = strings.to_stream(&builder)
 
-				assert(_encode_u64(e, u64(len(str)), .Text) == nil)
+				err := _encode_u64(e, u64(len(str)), .Text)
+				assert(err == nil)
 				res[9] = u8(len(builder.buf))
 				assert(res[9] < 10)
 				return
@@ -506,7 +507,7 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 		}
 		
 		n: u64; {
-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				if field_name(info, i) != "-" {
 					n += 1
 				}
@@ -522,7 +523,7 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 			entries := make([dynamic]Name, 0, n, e.temp_allocator) or_return
 			defer delete(entries)
 
-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				fname := field_name(info, i)
 				if fname == "-" {
 					continue
@@ -540,7 +541,7 @@ _marshal_into_encoder :: proc(e: Encoder, v: any, ti: ^runtime.Type_Info) -> (er
 				marshal_entry(e, info, v, entry.name, entry.field) or_return
 			}
 		} else {
-			for _, i in info.names {
+			for _, i in info.names[:info.field_count] {
 				fname := field_name(info, i)
 				if fname == "-" {
 					continue
diff --git a/core/encoding/cbor/unmarshal.odin b/core/encoding/cbor/unmarshal.odin
index 13350bb85..c54660839 100644
--- a/core/encoding/cbor/unmarshal.odin
+++ b/core/encoding/cbor/unmarshal.odin
@@ -96,7 +96,8 @@ _unmarshal_value :: proc(d: Decoder, v: any, hdr: Header, allocator := context.a
 			ti = reflect.type_info_base(variant)
 			if !reflect.is_pointer_internally(variant) {
 				tag := any{rawptr(uintptr(v.data) + u.tag_offset), u.tag_type.id}
-				assert(_assign_int(tag, 1))
+				assigned := _assign_int(tag, 1)
+				assert(assigned)
 			}
 		}
 	}
@@ -618,7 +619,7 @@ _unmarshal_map :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header,
 
 	#partial switch t in ti.variant {
 	case reflect.Type_Info_Struct:
-		if t.is_raw_union {
+		if .raw_union in t.flags {
 			return _unsupported(v, hdr)
 		}
 
diff --git a/core/encoding/ini/ini.odin b/core/encoding/ini/ini.odin
index eb0ad9e7c..2bb7996a3 100644
--- a/core/encoding/ini/ini.odin
+++ b/core/encoding/ini/ini.odin
@@ -82,15 +82,17 @@ Map :: distinct map[string]map[string]string
 
 load_map_from_string :: proc(src: string, allocator: runtime.Allocator, options := DEFAULT_OPTIONS) -> (m: Map, err: runtime.Allocator_Error) {
 	unquote :: proc(val: string) -> (string, runtime.Allocator_Error) {
-		v, allocated, ok := strconv.unquote_string(val)
-		if !ok {
-			return strings.clone(val)
+		if len(val) > 0 && (val[0] == '"' || val[0] == '\'') {
+			v, allocated, ok := strconv.unquote_string(val)
+			if !ok {
+				return strings.clone(val)
+			}
+			if allocated {
+				return v, nil
+			}
+			return strings.clone(v), nil
 		}
-		if allocated {
-			return v, nil
-		}
-		return strings.clone(v)
-
+		return strings.clone(val)
 	}
 
 	context.allocator = allocator
@@ -121,7 +123,7 @@ load_map_from_path :: proc(path: string, allocator: runtime.Allocator, options :
 	data := os.read_entire_file(path, allocator) or_return
 	defer delete(data, allocator)
 	m, err = load_map_from_string(string(data), allocator, options)
-	ok = err != nil
+	ok = err == nil
 	defer if !ok {
 		delete_map(m)
 	}
@@ -142,6 +144,7 @@ delete_map :: proc(m: Map) {
 			delete(value, allocator)
 		}
 		delete(section)
+		delete(pairs)
 	}
 	delete(m)
 }
diff --git a/core/encoding/json/marshal.odin b/core/encoding/json/marshal.odin
index 0464c24d1..009bf7ade 100644
--- a/core/encoding/json/marshal.odin
+++ b/core/encoding/json/marshal.odin
@@ -100,38 +100,7 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 
 	case runtime.Type_Info_Integer:
 		buf: [40]byte
-		u: u128
-		switch i in a {
-		case i8:      u = u128(i)
-		case i16:     u = u128(i)
-		case i32:     u = u128(i)
-		case i64:     u = u128(i)
-		case i128:    u = u128(i)
-		case int:     u = u128(i)
-		case u8:      u = u128(i)
-		case u16:     u = u128(i)
-		case u32:     u = u128(i)
-		case u64:     u = u128(i)
-		case u128:    u = u128(i)
-		case uint:    u = u128(i)
-		case uintptr: u = u128(i)
-
-		case i16le:  u = u128(i)
-		case i32le:  u = u128(i)
-		case i64le:  u = u128(i)
-		case u16le:  u = u128(i)
-		case u32le:  u = u128(i)
-		case u64le:  u = u128(i)
-		case u128le: u = u128(i)
-
-		case i16be:  u = u128(i)
-		case i32be:  u = u128(i)
-		case i64be:  u = u128(i)
-		case u16be:  u = u128(i)
-		case u32be:  u = u128(i)
-		case u64be:  u = u128(i)
-		case u128be: u = u128(i)
-		}
+		u := cast_any_int_to_u128(a)
 
 		s: string
 
@@ -310,7 +279,12 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 							case cstring: name = string(s)
 							}
 							opt_write_key(w, opt, name) or_return
-
+						case runtime.Type_Info_Integer:
+							buf: [40]byte
+							u := cast_any_int_to_u128(ka)
+							name = strconv.append_bits_128(buf[:], u, 10, info.signed, 8*kti.size, "0123456789", nil)
+							
+							opt_write_key(w, opt, name) or_return
 						case: return .Unsupported_Type
 						}
 					}
@@ -406,10 +380,15 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err:
 			ti := runtime.type_info_base(type_info_of(v.id))
 			info := ti.variant.(runtime.Type_Info_Struct)
 			first_iteration := true
-			for name, i in info.names {
+			for name, i in info.names[:info.field_count] {
 				omitempty := false
 
 				json_name, extra := json_name_from_tag_value(reflect.struct_tag_get(reflect.Struct_Tag(info.tags[i]), "json"))
+
+				if json_name == "-" {
+					continue
+				}
+
 				for flag in strings.split_iterator(&extra, ",") {
 					switch flag {
 					case "omitempty":
@@ -657,3 +636,41 @@ opt_write_indentation :: proc(w: io.Writer, opt: ^Marshal_Options) -> (err: io.E
 
 	return
 }
+
+@(private)
+cast_any_int_to_u128 :: proc(any_int_value: any) -> u128 {
+	u: u128 = 0
+	switch i in any_int_value {
+	case i8:      u = u128(i)
+	case i16:     u = u128(i)
+	case i32:     u = u128(i)
+	case i64:     u = u128(i)
+	case i128:    u = u128(i)
+	case int:     u = u128(i)
+	case u8:      u = u128(i)
+	case u16:     u = u128(i)
+	case u32:     u = u128(i)
+	case u64:     u = u128(i)
+	case u128:    u = u128(i)
+	case uint:    u = u128(i)
+	case uintptr: u = u128(i)
+
+	case i16le:  u = u128(i)
+	case i32le:  u = u128(i)
+	case i64le:  u = u128(i)
+	case u16le:  u = u128(i)
+	case u32le:  u = u128(i)
+	case u64le:  u = u128(i)
+	case u128le: u = u128(i)
+
+	case i16be:  u = u128(i)
+	case i32be:  u = u128(i)
+	case i64be:  u = u128(i)
+	case u16be:  u = u128(i)
+	case u32be:  u = u128(i)
+	case u64be:  u = u128(i)
+	case u128be: u = u128(i)
+	}
+
+	return u
+}
\ No newline at end of file
diff --git a/core/encoding/json/unmarshal.odin b/core/encoding/json/unmarshal.odin
index eb59e7838..127bce650 100644
--- a/core/encoding/json/unmarshal.odin
+++ b/core/encoding/json/unmarshal.odin
@@ -363,12 +363,11 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 	}
 
 	v := v
-	v = reflect.any_base(v)
-	ti := type_info_of(v.id)
+	ti := reflect.type_info_base(type_info_of(v.id))
 	
 	#partial switch t in ti.variant {
 	case reflect.Type_Info_Struct:
-		if t.is_raw_union {
+		if .raw_union in t.flags {
 			return UNSUPPORTED_TYPE
 		}
 	
@@ -475,7 +474,7 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 		}
 		
 	case reflect.Type_Info_Map:
-		if !reflect.is_string(t.key) {
+		if !reflect.is_string(t.key) && !reflect.is_integer(t.key) {
 			return UNSUPPORTED_TYPE
 		}
 		raw_map := (^mem.Raw_Map)(v.data)
@@ -492,25 +491,39 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm
 			key, _ := parse_object_key(p, p.allocator)
 			unmarshal_expect_token(p, .Colon)
 			
-			
+
 			mem.zero_slice(elem_backing)
 			if uerr := unmarshal_value(p, map_backing_value); uerr != nil {
 				delete(key, p.allocator)
 				return uerr
 			}
 
-			key_ptr := rawptr(&key)
+			key_ptr: rawptr
 
-			key_cstr: cstring
-			if reflect.is_cstring(t.key) {
-				key_cstr = cstring(raw_data(key))
-				key_ptr = &key_cstr
+			#partial switch tk in t.key.variant {
+				case runtime.Type_Info_String:			
+					key_ptr = rawptr(&key)
+					key_cstr: cstring
+					if reflect.is_cstring(t.key) {
+						key_cstr = cstring(raw_data(key))
+						key_ptr = &key_cstr
+					}
+				case runtime.Type_Info_Integer:
+					i, ok := strconv.parse_i128(key)
+					if !ok	{ return UNSUPPORTED_TYPE }
+					key_ptr = rawptr(&i)
+				case: return UNSUPPORTED_TYPE
 			}
-			
+
 			set_ptr := runtime.__dynamic_map_set_without_hash(raw_map, t.map_info, key_ptr, map_backing_value.data)
 			if set_ptr == nil {
 				delete(key, p.allocator)
 			} 
+
+			// there's no need to keep string value on the heap, since it was copied into map 
+			if reflect.is_integer(t.key) {
+				delete(key, p.allocator)
+			}
 			
 			if parse_comma(p) {
 				break map_loop
diff --git a/core/fmt/fmt.odin b/core/fmt/fmt.odin
index 234f4afbd..7b86fd1b7 100644
--- a/core/fmt/fmt.odin
+++ b/core/fmt/fmt.odin
@@ -334,6 +334,27 @@ panicf :: proc(fmt: string, args: ..any, loc := #caller_location) -> ! {
 	message := tprintf(fmt, ..args)
 	p("Panic", message, loc)
 }
+
+// 	Creates a formatted C string
+//
+// 	*Allocates Using Context's Allocator*
+//
+// 	Inputs:
+// 	- args: A variadic list of arguments to be formatted.
+// 	- sep: An optional separator string (default is a single space).
+//
+// 	Returns: A formatted C string.
+//
+@(require_results)
+caprint :: proc(args: ..any, sep := " ", allocator := context.allocator) -> cstring {
+	str: strings.Builder
+	strings.builder_init(&str, allocator)
+	sbprint(&str, ..args, sep=sep)
+	strings.write_byte(&str, 0)
+	s := strings.to_string(str)
+	return cstring(raw_data(s))
+}
+
 // Creates a formatted C string
 //
 // *Allocates Using Context's Allocator*
@@ -346,9 +367,9 @@ panicf :: proc(fmt: string, args: ..any, loc := #caller_location) -> ! {
 // Returns: A formatted C string
 //
 @(require_results)
-caprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
+caprintf :: proc(format: string, args: ..any, allocator := context.allocator, newline := false) -> cstring {
 	str: strings.Builder
-	strings.builder_init(&str)
+	strings.builder_init(&str, allocator)
 	sbprintf(&str, format, ..args, newline=newline)
 	strings.write_byte(&str, 0)
 	s := strings.to_string(str)
@@ -365,8 +386,8 @@ caprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
 // Returns: A formatted C string
 //
 @(require_results)
-caprintfln :: proc(format: string, args: ..any) -> cstring {
-	return caprintf(format, ..args, newline=true)
+caprintfln :: proc(format: string, args: ..any, allocator := context.allocator) -> cstring {
+	return caprintf(format, ..args, allocator=allocator, newline=true)
 }
 // 	Creates a formatted C string
 //
@@ -380,12 +401,7 @@ caprintfln :: proc(format: string, args: ..any) -> cstring {
 //
 @(require_results)
 ctprint :: proc(args: ..any, sep := " ") -> cstring {
-	str: strings.Builder
-	strings.builder_init(&str, context.temp_allocator)
-	sbprint(&str, ..args, sep=sep)
-	strings.write_byte(&str, 0)
-	s := strings.to_string(str)
-	return cstring(raw_data(s))
+	return caprint(args=args, sep=sep, allocator=context.temp_allocator)
 }
 // Creates a formatted C string
 //
@@ -400,12 +416,7 @@ ctprint :: proc(args: ..any, sep := " ") -> cstring {
 //
 @(require_results)
 ctprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
-	str: strings.Builder
-	strings.builder_init(&str, context.temp_allocator)
-	sbprintf(&str, format, ..args, newline=newline)
-	strings.write_byte(&str, 0)
-	s := strings.to_string(str)
-	return cstring(raw_data(s))
+	return caprintf(format=format, args=args, allocator=context.temp_allocator, newline=newline)
 }
 // Creates a formatted C string, followed by a newline.
 //
@@ -419,7 +430,7 @@ ctprintf :: proc(format: string, args: ..any, newline := false) -> cstring {
 //
 @(require_results)
 ctprintfln :: proc(format: string, args: ..any) -> cstring {
-	return ctprintf(format, ..args, newline=true)
+	return caprintf(format=format, args=args, allocator=context.temp_allocator, newline=true)
 }
 // Formats using the default print settings and writes to the given strings.Builder
 //
@@ -1861,7 +1872,7 @@ handle_tag :: proc(state: ^Info_State, data: rawptr, info: reflect.Type_Info_Str
 		if optional_len == nil {
 			return
 		}
-		for f, i in info.names {
+		for f, i in info.names[:info.field_count] {
 			if f != field_name {
 				continue
 			}
@@ -1965,7 +1976,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 		fmt_bad_verb(fi, the_verb)
 		return
 	}
-	if info.is_raw_union {
+	if .raw_union in info.flags {
 		if type_name == "" {
 			io.write_string(fi.writer, "(raw union)", &fi.n)
 		} else {
@@ -1989,7 +2000,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 	// fi.hash = false;
 	fi.indent += 1
 
-	is_empty := len(info.names) == 0
+	is_empty := info.field_count == 0
 
 	if !is_soa && hash && !is_empty {
 		io.write_byte(fi.writer, '\n', &fi.n)
@@ -2010,17 +2021,17 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 			base_type_name = v.name
 		}
 
-		actual_field_count := len(info.names)
+		actual_field_count := info.field_count
 
 		n := uintptr(info.soa_len)
 
 		if info.soa_kind == .Slice {
-			actual_field_count = len(info.names)-1 // len
+			actual_field_count = info.field_count-1 // len
 
 			n = uintptr((^int)(uintptr(v.data) + info.offsets[actual_field_count])^)
 
 		} else if info.soa_kind == .Dynamic {
-			actual_field_count = len(info.names)-3 // len, cap, allocator
+			actual_field_count = info.field_count-3 // len, cap, allocator
 
 			n = uintptr((^int)(uintptr(v.data) + info.offsets[actual_field_count])^)
 		}
@@ -2099,7 +2110,7 @@ fmt_struct :: proc(fi: ^Info, v: any, the_verb: rune, info: runtime.Type_Info_St
 		}
 	} else {
 		field_count := -1
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			optional_len: int = -1
 			use_nul_termination: bool = false
 			verb := the_verb if the_verb == 'w' else 'v'
@@ -2605,7 +2616,7 @@ fmt_bit_field :: proc(fi: ^Info, v: any, verb: rune, info: runtime.Type_Info_Bit
 
 
 	field_count := -1
-	for name, i in info.names {
+	for name, i in info.names[:info.field_count] {
 		field_verb := verb
 		if handle_bit_field_tag(v.data, info, i, &field_verb) {
 			continue
@@ -2751,9 +2762,11 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 			elem := runtime.type_info_base(info.elem)
 			if elem != nil {
 				if n, ok := fi.optional_len.?; ok {
+					fi.optional_len = nil
 					fmt_array(fi, ptr, n, elem.size, elem, verb)
 					return
 				} else if fi.use_nul_termination {
+					fi.use_nul_termination = false
 					fmt_array_nul_terminated(fi, ptr, -1, elem.size, elem, verb)
 					return
 				}
@@ -2855,8 +2868,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := info.count
 		ptr := v.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
@@ -2867,8 +2882,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := slice.len
 		ptr := slice.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
@@ -2879,8 +2896,10 @@ fmt_value :: proc(fi: ^Info, v: any, verb: rune) {
 		n := array.len
 		ptr := array.data
 		if ol, ok := fi.optional_len.?; ok {
+			fi.optional_len = nil
 			n = min(n, ol)
 		} else if fi.use_nul_termination {
+			fi.use_nul_termination = false
 			fmt_array_nul_terminated(fi, ptr, n, info.elem_size, info.elem, verb)
 			return
 		}
diff --git a/core/math/cmplx/cmplx.odin b/core/math/cmplx/cmplx.odin
index 4625f83c6..d1c70ca61 100644
--- a/core/math/cmplx/cmplx.odin
+++ b/core/math/cmplx/cmplx.odin
@@ -229,7 +229,7 @@ sqrt_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 }
 
 ln_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	return complex(math.ln(abs(x)), phase(x))
+	return complex32(ln_complex64(complex64(x)))
 }
 ln_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	return complex(math.ln(abs(x)), phase(x))
@@ -240,26 +240,7 @@ ln_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 
 
 exp_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	switch re, im := real(x), imag(x); {
-	case math.is_inf(re, 0):
-		switch {
-		case re > 0 && im == 0:
-			return x
-		case math.is_inf(im, 0) || math.is_nan(im):
-			if re < 0 {
-				return complex(0, math.copy_sign(0, im))
-			} else {
-				return complex(math.inf_f64(1.0), math.nan_f64())
-			}
-		}
-	case math.is_nan(re):
-		if im == 0 {
-			return complex(math.nan_f16(), im)
-		}
-	}
-	r := math.exp(real(x))
-	s, c := math.sincos(imag(x))
-	return complex(r*c, r*s)
+	return complex32(exp_complex64(complex64(x)))
 }
 exp_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	switch re, im := real(x), imag(x); {
@@ -308,37 +289,7 @@ exp_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 
 
 pow_complex32 :: proc "contextless" (x, y: complex32) -> complex32 {
-	if x == 0 { // Guaranteed also true for x == -0.
-		if is_nan(y) {
-			return nan_complex32()
-		}
-		r, i := real(y), imag(y)
-		switch {
-		case r == 0:
-			return 1
-		case r < 0:
-			if i == 0 {
-				return complex(math.inf_f16(1), 0)
-			}
-			return inf_complex32()
-		case r > 0:
-			return 0
-		}
-		unreachable()
-	}
-	modulus := abs(x)
-	if modulus == 0 {
-		return complex(0, 0)
-	}
-	r := math.pow(modulus, real(y))
-	arg := phase(x)
-	theta := real(y) * arg
-	if imag(y) != 0 {
-		r *= math.exp(-imag(y) * arg)
-		theta += imag(y) * math.ln(modulus)
-	}
-	s, c := math.sincos(theta)
-	return complex(r*c, r*s)
+	return complex32(pow_complex64(complex64(x), complex64(y)))
 }
 pow_complex64 :: proc "contextless" (x, y: complex64) -> complex64 {
 	if x == 0 { // Guaranteed also true for x == -0.
@@ -410,7 +361,7 @@ pow_complex128 :: proc "contextless" (x, y: complex128) -> complex128 {
 
 
 log10_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	return math.LN10*ln(x)
+	return complex32(log10_complex64(complex64(x)))
 }
 log10_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	return math.LN10*ln(x)
@@ -421,7 +372,7 @@ log10_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 
 
 phase_complex32 :: proc "contextless" (x:  complex32) -> f16 {
-	return math.atan2(imag(x), real(x))
+	return f16(phase_complex64(complex64(x)))
 }
 phase_complex64 :: proc "contextless" (x:  complex64) -> f32 {
 	return math.atan2(imag(x), real(x))
@@ -432,8 +383,7 @@ phase_complex128 :: proc "contextless" (x:  complex128) -> f64 {
 
 
 rect_complex32 :: proc "contextless" (r, θ: f16) -> complex32 {
-	s, c := math.sincos(θ)
-	return complex(r*c, r*s)
+	return complex32(rect_complex64(f32(r), f32(θ)))
 }
 rect_complex64 :: proc "contextless" (r, θ: f32) -> complex64 {
 	s, c := math.sincos(θ)
diff --git a/core/math/cmplx/cmplx_invtrig.odin b/core/math/cmplx/cmplx_invtrig.odin
index b84f0ac9c..40a8493bc 100644
--- a/core/math/cmplx/cmplx_invtrig.odin
+++ b/core/math/cmplx/cmplx_invtrig.odin
@@ -61,8 +61,7 @@ atanh :: proc{
 
 
 acos_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	w := asin(x)
-	return complex(math.PI/2 - real(w), -imag(w))
+	return complex32(acos_complex64(complex64(x)))
 }
 acos_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	w := asin(x)
@@ -75,14 +74,7 @@ acos_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 
 
 acosh_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	if x == 0 {
-		return complex(0, math.copy_sign(math.PI/2, imag(x)))
-	}
-	w := acos(x)
-	if imag(w) <= 0 {
-		return complex(-imag(w), real(w))
-	}
-	return complex(imag(w), -real(w))
+	return complex32(acosh_complex64(complex64(x)))
 }
 acosh_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	if x == 0 {
@@ -257,9 +249,7 @@ atan_complex128 :: proc "contextless" (x: complex128) -> complex128 {
 }
 
 atanh_complex32 :: proc "contextless" (x: complex32) -> complex32 {
-	z := complex(-imag(x), real(x)) // z = i * x
-	z = atan(z)
-	return complex(imag(z), -real(z)) // z = -i * z
+	return complex32(atanh_complex64(complex64(x)))
 }
 atanh_complex64 :: proc "contextless" (x: complex64) -> complex64 {
 	z := complex(-imag(x), real(x)) // z = i * x
diff --git a/core/math/rand/rand.odin b/core/math/rand/rand.odin
index 4fdbad01c..e02f3db80 100644
--- a/core/math/rand/rand.odin
+++ b/core/math/rand/rand.odin
@@ -618,10 +618,16 @@ shuffle :: proc(array: $T/[]$E, gen := context.random_generator) {
 		return
 	}
 
-	for i := i64(n - 1); i > 0; i -= 1 {
+	i := n - 1
+	for ; i > (1<<31 - 2); i -= 1 {
 		j := int63_max(i + 1, gen)
 		array[i], array[j] = array[j], array[i]
 	}
+
+	for ; i > 0; i -= 1 {
+		j := int31_max(i32(i + 1), gen)
+		array[i], array[j] = array[j], array[i]
+	}
 }
 
 /*
diff --git a/core/net/socket_linux.odin b/core/net/socket_linux.odin
index a5d553234..350d3947c 100644
--- a/core/net/socket_linux.odin
+++ b/core/net/socket_linux.odin
@@ -117,7 +117,7 @@ _wrap_os_addr :: proc "contextless" (addr: linux.Sock_Addr_Any)->(Endpoint) {
 _create_socket :: proc(family: Address_Family, protocol: Socket_Protocol) -> (Any_Socket, Network_Error) {
 	family := _unwrap_os_family(family)
 	proto, socktype := _unwrap_os_proto_socktype(protocol)
-	sock, errno := linux.socket(family, socktype, {}, proto)
+	sock, errno := linux.socket(family, socktype, {.CLOEXEC}, proto)
 	if errno != .NONE {
 		return {}, Create_Socket_Error(errno)
 	}
@@ -132,7 +132,7 @@ _dial_tcp_from_endpoint :: proc(endpoint: Endpoint, options := default_tcp_optio
 	}
 	// Create new TCP socket
 	os_sock: linux.Fd
-	os_sock, errno = linux.socket(_unwrap_os_family(family_from_endpoint(endpoint)), .STREAM, {}, .TCP)
+	os_sock, errno = linux.socket(_unwrap_os_family(family_from_endpoint(endpoint)), .STREAM, {.CLOEXEC}, .TCP)
 	if errno != .NONE {
 		// TODO(flysand): should return invalid file descriptor here casted as TCP_Socket
 		return {}, Create_Socket_Error(errno)
@@ -172,7 +172,7 @@ _listen_tcp :: proc(endpoint: Endpoint, backlog := 1000) -> (TCP_Socket, Network
 	ep_address := _unwrap_os_addr(endpoint)
 	// Create TCP socket
 	os_sock: linux.Fd
-	os_sock, errno = linux.socket(ep_family, .STREAM, {}, .TCP)
+	os_sock, errno = linux.socket(ep_family, .STREAM, {.CLOEXEC}, .TCP)
 	if errno != .NONE {
 		// TODO(flysand): should return invalid file descriptor here casted as TCP_Socket
 		return {}, Create_Socket_Error(errno)
diff --git a/core/odin/ast/ast.odin b/core/odin/ast/ast.odin
index 92d00b47c..0ae822e21 100644
--- a/core/odin/ast/ast.odin
+++ b/core/odin/ast/ast.odin
@@ -599,6 +599,7 @@ Field_Flag :: enum {
 	Subtype,
 	By_Ptr,
 	No_Broadcast,
+	No_Capture,
 
 	Results,
 	Tags,
@@ -619,6 +620,7 @@ field_flag_strings := [Field_Flag]string{
 	.Subtype            = "#subtype",
 	.By_Ptr             = "#by_ptr",
 	.No_Broadcast       = "#no_broadcast",
+	.No_Capture         = "#no_capture",
 
 	.Results            = "results",
 	.Tags               = "field tag",
@@ -634,6 +636,7 @@ field_hash_flag_strings := []struct{key: string, flag: Field_Flag}{
 	{"subtype",      .Subtype},
 	{"by_ptr",       .By_Ptr},
 	{"no_broadcast", .No_Broadcast},
+	{"no_capture",   .No_Capture},
 }
 
 
diff --git a/core/odin/parser/parser.odin b/core/odin/parser/parser.odin
index dec892f84..4d045f785 100644
--- a/core/odin/parser/parser.odin
+++ b/core/odin/parser/parser.odin
@@ -2179,22 +2179,25 @@ parse_inlining_operand :: proc(p: ^Parser, lhs: bool, tok: tokenizer.Token) -> ^
 		}
 	}
 
-	#partial switch e in ast.strip_or_return_expr(expr).derived_expr {
-	case ^ast.Proc_Lit:
-		if e.inlining != .None && e.inlining != pi {
-			error(p, expr.pos, "both 'inline' and 'no_inline' cannot be applied to a procedure literal")
+	if expr != nil {
+		#partial switch e in ast.strip_or_return_expr(expr).derived_expr {
+		case ^ast.Proc_Lit:
+			if e.inlining != .None && e.inlining != pi {
+				error(p, expr.pos, "both 'inline' and 'no_inline' cannot be applied to a procedure literal")
+			}
+			e.inlining = pi
+			return expr
+		case ^ast.Call_Expr:
+			if e.inlining != .None && e.inlining != pi {
+				error(p, expr.pos, "both 'inline' and 'no_inline' cannot be applied to a procedure call")
+			}
+			e.inlining = pi
+			return expr
 		}
-		e.inlining = pi
-	case ^ast.Call_Expr:
-		if e.inlining != .None && e.inlining != pi {
-			error(p, expr.pos, "both 'inline' and 'no_inline' cannot be applied to a procedure call")
-		}
-		e.inlining = pi
-	case:
-		error(p, tok.pos, "'%s' must be followed by a procedure literal or call", tok.text)
-		return ast.new(ast.Bad_Expr, tok.pos, expr)
 	}
-	return expr
+
+	error(p, tok.pos, "'%s' must be followed by a procedure literal or call", tok.text)
+	return ast.new(ast.Bad_Expr, tok.pos, expr)
 }
 
 parse_operand :: proc(p: ^Parser, lhs: bool) -> ^ast.Expr {
@@ -2258,18 +2261,18 @@ parse_operand :: proc(p: ^Parser, lhs: bool) -> ^ast.Expr {
 			hp.type = type
 			return hp
 
-		case "file", "line", "procedure", "caller_location":
+		case "file", "directory", "line", "procedure", "caller_location":
 			bd := ast.new(ast.Basic_Directive, tok.pos, end_pos(name))
 			bd.tok  = tok
 			bd.name = name.text
 			return bd
-		case "location", "load", "assert", "defined", "config":
+
+		case "location", "exists", "load", "load_directory", "load_hash", "hash", "assert", "panic", "defined", "config":
 			bd := ast.new(ast.Basic_Directive, tok.pos, end_pos(name))
 			bd.tok  = tok
 			bd.name = name.text
 			return parse_call_expr(p, bd)
 
-
 		case "soa":
 			bd := ast.new(ast.Basic_Directive, tok.pos, end_pos(name))
 			bd.tok  = tok
diff --git a/core/os/os2/dir.odin b/core/os/os2/dir.odin
new file mode 100644
index 000000000..6334ee7b8
--- /dev/null
+++ b/core/os/os2/dir.odin
@@ -0,0 +1,80 @@
+package os2
+
+import "base:runtime"
+import "core:slice"
+
+@(require_results)
+read_directory :: proc(f: ^File, n: int, allocator: runtime.Allocator) -> (files: []File_Info, err: Error) {
+	if f == nil {
+		return nil, .Invalid_File
+	}
+
+	n := n
+	size := n
+	if n <= 0 {
+		n = -1
+		size = 100
+	}
+
+	TEMP_ALLOCATOR_GUARD()
+
+	it := read_directory_iterator_create(f) or_return
+	defer _read_directory_iterator_destroy(&it)
+
+	dfi := make([dynamic]File_Info, 0, size, temp_allocator())
+	defer if err != nil {
+		for fi in dfi {
+			file_info_delete(fi, allocator)
+		}
+	}
+
+	for fi, index in read_directory_iterator(&it) {
+		if n > 0 && index == n {
+			break
+		}
+		append(&dfi, file_info_clone(fi, allocator) or_return)
+	}
+
+	return slice.clone(dfi[:], allocator)
+}
+
+
+@(require_results)
+read_all_directory :: proc(f: ^File, allocator: runtime.Allocator) -> (fi: []File_Info, err: Error) {
+	return read_directory(f, -1, allocator)
+}
+
+@(require_results)
+read_directory_by_path :: proc(path: string, n: int, allocator: runtime.Allocator) -> (fi: []File_Info, err: Error) {
+	f := open(path) or_return
+	defer close(f)
+	return read_directory(f, n, allocator)
+}
+
+@(require_results)
+read_all_directory_by_path :: proc(path: string, allocator: runtime.Allocator) -> (fi: []File_Info, err: Error) {
+	return read_directory_by_path(path, -1, allocator)
+}
+
+
+Read_Directory_Iterator :: struct {
+	f:    ^File,
+	impl: Read_Directory_Iterator_Impl,
+}
+
+
+@(require_results)
+read_directory_iterator_create :: proc(f: ^File) -> (Read_Directory_Iterator, Error) {
+	return _read_directory_iterator_create(f)
+}
+
+read_directory_iterator_destroy :: proc(it: ^Read_Directory_Iterator) {
+	_read_directory_iterator_destroy(it)
+}
+
+
+// NOTE(bill): `File_Info` does not need to deleted on each iteration. Any copies must be manually copied with `file_info_clone`
+@(require_results)
+read_directory_iterator :: proc(it: ^Read_Directory_Iterator) -> (fi: File_Info, index: int, ok: bool) {
+	return _read_directory_iterator(it)
+}
diff --git a/core/os/os2/dir_linux.odin b/core/os/os2/dir_linux.odin
new file mode 100644
index 000000000..d4f62e213
--- /dev/null
+++ b/core/os/os2/dir_linux.odin
@@ -0,0 +1,20 @@
+//+private
+package os2
+
+Read_Directory_Iterator_Impl :: struct {
+
+}
+
+
+@(require_results)
+_read_directory_iterator :: proc(it: ^Read_Directory_Iterator) -> (fi: File_Info, index: int, ok: bool) {
+	return
+}
+
+@(require_results)
+_read_directory_iterator_create :: proc(f: ^File) -> (Read_Directory_Iterator, Error) {
+	return {}, nil
+}
+
+_read_directory_iterator_destroy :: proc(it: ^Read_Directory_Iterator) {
+}
diff --git a/core/os/os2/dir_windows.odin b/core/os/os2/dir_windows.odin
new file mode 100644
index 000000000..84f320095
--- /dev/null
+++ b/core/os/os2/dir_windows.odin
@@ -0,0 +1,141 @@
+//+private
+package os2
+
+import "base:runtime"
+import "core:time"
+import win32 "core:sys/windows"
+
+@(private="file")
+find_data_to_file_info :: proc(base_path: string, d: ^win32.WIN32_FIND_DATAW, allocator: runtime.Allocator) -> (fi: File_Info, err: Error) {
+	// Ignore "." and ".."
+	if d.cFileName[0] == '.' && d.cFileName[1] == 0 {
+		return
+	}
+	if d.cFileName[0] == '.' && d.cFileName[1] == '.' && d.cFileName[2] == 0 {
+		return
+	}
+	path := concatenate({base_path, `\`, win32_utf16_to_utf8(d.cFileName[:], temp_allocator()) or_else ""}, allocator) or_return
+
+
+	fi.fullpath = path
+	fi.name = basename(path)
+	fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
+
+	fi.type, fi.mode = _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, d.dwReserved0)
+
+	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
+	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
+	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
+
+
+	handle := win32.HANDLE(_open_internal(path, {.Read}, 0o666) or_else 0)
+	defer win32.CloseHandle(handle)
+
+	if file_id_info: win32.FILE_ID_INFO; handle != nil && win32.GetFileInformationByHandleEx(handle, .FileIdInfo, &file_id_info, size_of(file_id_info)) {
+		#assert(size_of(fi.inode) == size_of(file_id_info.FileId))
+		#assert(size_of(fi.inode) == 16)
+		runtime.mem_copy_non_overlapping(&fi.inode, &file_id_info.FileId, 16)
+	}
+
+
+	return
+}
+
+Read_Directory_Iterator_Impl :: struct {
+	find_data:     win32.WIN32_FIND_DATAW,
+	find_handle:   win32.HANDLE,
+	path:          string,
+	prev_fi:       File_Info,
+	no_more_files: bool,
+	index:         int,
+}
+
+
+@(require_results)
+_read_directory_iterator :: proc(it: ^Read_Directory_Iterator) -> (fi: File_Info, index: int, ok: bool) {
+	if it.f == nil {
+		return
+	}
+
+	TEMP_ALLOCATOR_GUARD()
+
+	for !it.impl.no_more_files {
+		err: Error
+		file_info_delete(it.impl.prev_fi, file_allocator())
+		it.impl.prev_fi = {}
+
+		fi, err = find_data_to_file_info(it.impl.path, &it.impl.find_data, file_allocator())
+		if err != nil {
+			return
+		}
+		if fi.name != "" {
+			it.impl.prev_fi = fi
+			ok = true
+			index = it.impl.index
+			it.impl.index += 1
+		}
+
+		if !win32.FindNextFileW(it.impl.find_handle, &it.impl.find_data) {
+			e := _get_platform_error()
+			if pe, _ := is_platform_error(e); pe == i32(win32.ERROR_NO_MORE_FILES) {
+				it.impl.no_more_files = true
+			}
+			it.impl.no_more_files = true
+		}
+		if ok {
+			return
+		}
+	}
+	return
+}
+
+@(require_results)
+_read_directory_iterator_create :: proc(f: ^File) -> (it: Read_Directory_Iterator, err: Error) {
+	if f == nil {
+		return
+	}
+	it.f = f
+	impl := (^File_Impl)(f.impl)
+
+	if !is_directory(impl.name) {
+		err = .Invalid_Dir
+		return
+	}
+
+	wpath: []u16
+	{
+		i := 0
+		for impl.wname[i] != 0 {
+			i += 1
+		}
+		wpath = impl.wname[:i]
+	}
+
+	TEMP_ALLOCATOR_GUARD()
+
+	wpath_search := make([]u16, len(wpath)+3, temp_allocator())
+	copy(wpath_search, wpath)
+	wpath_search[len(wpath)+0] = '\\'
+	wpath_search[len(wpath)+1] = '*'
+	wpath_search[len(wpath)+2] = 0
+
+	it.impl.find_handle = win32.FindFirstFileW(raw_data(wpath_search), &it.impl.find_data)
+	if it.impl.find_handle == win32.INVALID_HANDLE_VALUE {
+		err = _get_platform_error()
+		return
+	}
+	defer if err != nil {
+		win32.FindClose(it.impl.find_handle)
+	}
+
+	it.impl.path = _cleanpath_from_buf(wpath, file_allocator()) or_return
+	return
+}
+
+_read_directory_iterator_destroy :: proc(it: ^Read_Directory_Iterator) {
+	if it.f == nil {
+		return
+	}
+	file_info_delete(it.impl.prev_fi, file_allocator())
+	win32.FindClose(it.impl.find_handle)
+}
\ No newline at end of file
diff --git a/core/os/os2/env_windows.odin b/core/os/os2/env_windows.odin
index 39694b821..870b5a731 100644
--- a/core/os/os2/env_windows.odin
+++ b/core/os/os2/env_windows.odin
@@ -8,7 +8,8 @@ _lookup_env :: proc(key: string, allocator: runtime.Allocator) -> (value: string
 	if key == "" {
 		return
 	}
-	wkey := win32.utf8_to_wstring(key)
+	TEMP_ALLOCATOR_GUARD()
+	wkey, _ := win32_utf8_to_wstring(key, temp_allocator())
 
 	n := win32.GetEnvironmentVariableW(wkey, nil, 0)
 	if n == 0 {
@@ -32,20 +33,22 @@ _lookup_env :: proc(key: string, allocator: runtime.Allocator) -> (value: string
 		return "", false
 	}
 
-	value = win32.utf16_to_utf8(b[:n], allocator) or_else ""
+	value = win32_utf16_to_utf8(b[:n], allocator) or_else ""
 	found = true
 	return
 }
 
 _set_env :: proc(key, value: string) -> bool {
-	k := win32.utf8_to_wstring(key)
-	v := win32.utf8_to_wstring(value)
+	TEMP_ALLOCATOR_GUARD()
+	k, _ := win32_utf8_to_wstring(key,   temp_allocator())
+	v, _ := win32_utf8_to_wstring(value, temp_allocator())
 
 	return bool(win32.SetEnvironmentVariableW(k, v))
 }
 
 _unset_env :: proc(key: string) -> bool {
-	k := win32.utf8_to_wstring(key)
+	TEMP_ALLOCATOR_GUARD()
+	k, _ := win32_utf8_to_wstring(key, temp_allocator())
 	return bool(win32.SetEnvironmentVariableW(k, nil))
 }
 
@@ -89,7 +92,7 @@ _environ :: proc(allocator: runtime.Allocator) -> []string {
 				break
 			}
 			w := ([^]u16)(p)[from:i]
-			append(&r, win32.utf16_to_utf8(w, allocator) or_else "")
+			append(&r, win32_utf16_to_utf8(w, allocator) or_else "")
 			from = i + 1
 		}
 	}
diff --git a/core/os/os2/errors.odin b/core/os/os2/errors.odin
index 51d8314b4..2b9b3528e 100644
--- a/core/os/os2/errors.odin
+++ b/core/os/os2/errors.odin
@@ -22,6 +22,7 @@ General_Error :: enum u32 {
 	Invalid_File,
 	Invalid_Dir,
 	Invalid_Path,
+	Invalid_Callback,
 
 	Pattern_Has_Separator,
 
@@ -38,6 +39,8 @@ Error :: union #shared_nil {
 }
 #assert(size_of(Error) == size_of(u64))
 
+ERROR_NONE :: Error{}
+
 
 
 is_platform_error :: proc(ferr: Error) -> (err: i32, ok: bool) {
@@ -64,6 +67,7 @@ error_string :: proc(ferr: Error) -> string {
 		case .Invalid_File:      return "invalid file"
 		case .Invalid_Dir:       return "invalid directory"
 		case .Invalid_Path:      return "invalid path"
+		case .Invalid_Callback:  return "invalid callback"
 		case .Unsupported:       return "unsupported"
 		case .Pattern_Has_Separator: return "pattern has separator"
 		}
diff --git a/core/os/os2/errors_windows.odin b/core/os/os2/errors_windows.odin
index 6500e7ccc..6421d26ee 100644
--- a/core/os/os2/errors_windows.odin
+++ b/core/os/os2/errors_windows.odin
@@ -1,6 +1,8 @@
 //+private
 package os2
 
+import "base:runtime"
+import "core:slice"
 import win32 "core:sys/windows"
 
 _error_string :: proc(errno: i32) -> string {
@@ -8,9 +10,14 @@ _error_string :: proc(errno: i32) -> string {
 	if e == 0 {
 		return ""
 	}
-	// TODO(bill): _error_string for windows
-	// FormatMessageW
-	return ""
+
+	err := runtime.Type_Info_Enum_Value(e)
+
+	ti := &runtime.type_info_base(type_info_of(win32.System_Error)).variant.(runtime.Type_Info_Enum)
+	if idx, ok := slice.binary_search(ti.values, err); ok {
+		return ti.names[idx]
+	}
+	return "<unknown platform error>"
 }
 
 _get_platform_error :: proc() -> Error {
diff --git a/core/os/os2/file.odin b/core/os/os2/file.odin
index 236423163..52fd02478 100644
--- a/core/os/os2/file.odin
+++ b/core/os/os2/file.odin
@@ -4,20 +4,57 @@ import "core:io"
 import "core:time"
 import "base:runtime"
 
+/*
+	Type representing a file handle.
+
+	This struct represents an OS-specific file-handle, which can be one of
+	the following:
+	- File
+	- Directory
+	- Pipe
+	- Named pipe
+	- Block Device
+	- Character device
+	- Symlink
+	- Socket
+
+	See `File_Type` enum for more information on file types.
+*/
 File :: struct {
-	impl:   _File,
+	impl:   rawptr,
 	stream: io.Stream,
-	user_fstat: Fstat_Callback,
+	fstat:  Fstat_Callback,
 }
 
-File_Mode :: distinct u32
-File_Mode_Dir         :: File_Mode(1<<16)
-File_Mode_Named_Pipe  :: File_Mode(1<<17)
-File_Mode_Device      :: File_Mode(1<<18)
-File_Mode_Char_Device :: File_Mode(1<<19)
-File_Mode_Sym_Link    :: File_Mode(1<<20)
+/*
+	Type representing the type of a file handle.
 
-File_Mode_Perm :: File_Mode(0o777) // Unix permision bits
+	**Note(windows)**: Socket handles can not be distinguished from
+	files, as they are just a normal file handle that is being treated by
+	a special driver. Windows also makes no distinction between block and
+	character devices.
+*/
+File_Type :: enum {
+	// The type of a file could not be determined for the current platform.
+	Undetermined,
+	// Represents a regular file.
+	Regular,
+	// Represents a directory.
+	Directory,
+	// Represents a symbolic link.
+	Symlink,
+	// Represents a named pipe (FIFO).
+	Named_Pipe,
+	// Represents a socket.
+	// **Note(windows)**: Not returned on windows
+	Socket,
+	// Represents a block device.
+	// **Note(windows)**: On windows represents all devices.
+	Block_Device,
+	// Represents a character device.
+	// **Note(windows)**: Not returned on windows
+	Character_Device,
+}
 
 File_Flags :: distinct bit_set[File_Flag; uint]
 File_Flag :: enum {
@@ -29,7 +66,7 @@ File_Flag :: enum {
 	Sync,
 	Trunc,
 	Sparse,
-	Close_On_Exec,
+	Inheritable,
 
 	Unbuffered_IO,
 }
@@ -43,7 +80,15 @@ O_EXCL    :: File_Flags{.Excl}
 O_SYNC    :: File_Flags{.Sync}
 O_TRUNC   :: File_Flags{.Trunc}
 O_SPARSE  :: File_Flags{.Sparse}
-O_CLOEXEC :: File_Flags{.Close_On_Exec}
+
+/*
+	If specified, the file handle is inherited upon the creation of a child
+	process. By default all handles are created non-inheritable.
+
+	**Note**: The standard file handles (stderr, stdout and stdin) are always
+	initialized as inheritable.
+*/
+O_INHERITABLE :: File_Flags{.Inheritable}
 
 stdin:  ^File = nil // OS-Specific
 stdout: ^File = nil // OS-Specific
@@ -51,17 +96,17 @@ stderr: ^File = nil // OS-Specific
 
 @(require_results)
 create :: proc(name: string) -> (^File, Error) {
-	return open(name, {.Read, .Write, .Create}, File_Mode(0o777))
+	return open(name, {.Read, .Write, .Create}, 0o777)
 }
 
 @(require_results)
-open :: proc(name: string, flags := File_Flags{.Read}, perm := File_Mode(0o777)) -> (^File, Error) {
+open :: proc(name: string, flags := File_Flags{.Read}, perm := 0o777) -> (^File, Error) {
 	return _open(name, flags, perm)
 }
 
 @(require_results)
 new_file :: proc(handle: uintptr, name: string) -> ^File {
-	return _new_file(handle, name)
+	return _new_file(handle, name) or_else panic("Out of memory")
 }
 
 @(require_results)
@@ -161,44 +206,56 @@ read_link :: proc(name: string, allocator: runtime.Allocator) -> (string, Error)
 
 
 chdir :: change_directory
+
 change_directory :: proc(name: string) -> Error {
 	return _chdir(name)
 }
 
 chmod :: change_mode
-change_mode :: proc(name: string, mode: File_Mode) -> Error {
+
+change_mode :: proc(name: string, mode: int) -> Error {
 	return _chmod(name, mode)
 }
+
 chown :: change_owner
+
 change_owner :: proc(name: string, uid, gid: int) -> Error {
 	return _chown(name, uid, gid)
 }
 
 fchdir :: fchange_directory
+
 fchange_directory :: proc(f: ^File) -> Error {
 	return _fchdir(f)
 }
+
 fchmod :: fchange_mode
-fchange_mode :: proc(f: ^File, mode: File_Mode) -> Error {
+
+fchange_mode :: proc(f: ^File, mode: int) -> Error {
 	return _fchmod(f, mode)
 }
 
 fchown :: fchange_owner
+
 fchange_owner :: proc(f: ^File, uid, gid: int) -> Error {
 	return _fchown(f, uid, gid)
 }
 
 
 lchown :: change_owner_do_not_follow_links
+
 change_owner_do_not_follow_links :: proc(name: string, uid, gid: int) -> Error {
 	return _lchown(name, uid, gid)
 }
 
 chtimes :: change_times
+
 change_times :: proc(name: string, atime, mtime: time.Time) -> Error {
 	return _chtimes(name, atime, mtime)
 }
+
 fchtimes :: fchange_times
+
 fchange_times :: proc(f: ^File, atime, mtime: time.Time) -> Error {
 	return _fchtimes(f, atime, mtime)
 }
@@ -210,13 +267,24 @@ exists :: proc(path: string) -> bool {
 
 @(require_results)
 is_file :: proc(path: string) -> bool {
-	return _is_file(path)
+	TEMP_ALLOCATOR_GUARD()
+	fi, err := stat(path, temp_allocator())
+	if err != nil {
+		return false
+	}
+	return fi.type == .Regular
 }
 
 is_dir :: is_directory
+
 @(require_results)
 is_directory :: proc(path: string) -> bool {
-	return _is_dir(path)
+	TEMP_ALLOCATOR_GUARD()
+	fi, err := stat(path, temp_allocator())
+	if err != nil {
+		return false
+	}
+	return fi.type == .Directory
 }
 
 
@@ -226,11 +294,11 @@ copy_file :: proc(dst_path, src_path: string) -> Error {
 
 	info := fstat(src, file_allocator()) or_return
 	defer file_info_delete(info, file_allocator())
-	if info.is_directory {
+	if info.type == .Directory {
 		return .Invalid_File
 	}
 
-	dst := open(dst_path, {.Read, .Write, .Create, .Trunc}, info.mode & File_Mode_Perm) or_return
+	dst := open(dst_path, {.Read, .Write, .Create, .Trunc}, info.mode & 0o777) or_return
 	defer close(dst)
 
 	_, err := io.copy(to_writer(dst), to_reader(src))
diff --git a/core/os/os2/file_linux.odin b/core/os/os2/file_linux.odin
index 8e7db9751..d2a7483ca 100644
--- a/core/os/os2/file_linux.odin
+++ b/core/os/os2/file_linux.odin
@@ -6,14 +6,15 @@ import "core:time"
 import "base:runtime"
 import "core:sys/linux"
 
-_File :: struct {
+File_Impl :: struct {
+	file: File,
 	name: string,
 	fd: linux.Fd,
 	allocator: runtime.Allocator,
 }
 
-_stdin : File = {
-	impl = {
+_stdin := File{
+	impl = &File_Impl{
 		name = "/proc/self/fd/0",
 		fd = 0,
 		allocator = _file_allocator(),
@@ -21,9 +22,10 @@ _stdin : File = {
 	stream = {
 		procedure = _file_stream_proc,
 	},
+	fstat = _fstat,
 }
-_stdout : File = {
-	impl = {
+_stdout := File{
+	impl = &File_Impl{
 		name = "/proc/self/fd/1",
 		fd = 1,
 		allocator = _file_allocator(),
@@ -31,9 +33,10 @@ _stdout : File = {
 	stream = {
 		procedure = _file_stream_proc,
 	},
+	fstat = _fstat,
 }
-_stderr : File = {
-	impl = {
+_stderr := File{
+	impl = &File_Impl{
 		name = "/proc/self/fd/2",
 		fd = 2,
 		allocator = _file_allocator(),
@@ -41,6 +44,7 @@ _stderr : File = {
 	stream = {
 		procedure = _file_stream_proc,
 	},
+	fstat = _fstat,
 }
 
 @init
@@ -59,70 +63,67 @@ _file_allocator :: proc() -> runtime.Allocator {
 	return heap_allocator()
 }
 
-_open :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (f: ^File, err: Error) {
+_open :: proc(name: string, flags: File_Flags, perm: int) -> (f: ^File, err: Error) {
 	TEMP_ALLOCATOR_GUARD()
 	name_cstr := temp_cstring(name) or_return
 
 	// Just default to using O_NOCTTY because needing to open a controlling
 	// terminal would be incredibly rare. This has no effect on files while
 	// allowing us to open serial devices.
-	sys_flags: linux.Open_Flags = {.NOCTTY}
+	sys_flags: linux.Open_Flags = {.NOCTTY, .CLOEXEC}
 	switch flags & O_RDONLY|O_WRONLY|O_RDWR {
 	case O_RDONLY:
 	case O_WRONLY: sys_flags += {.WRONLY}
 	case O_RDWR:   sys_flags += {.RDWR}
 	}
-
 	if .Append in flags        { sys_flags += {.APPEND} }
 	if .Create in flags        { sys_flags += {.CREAT} }
 	if .Excl in flags          { sys_flags += {.EXCL} }
 	if .Sync in flags          { sys_flags += {.DSYNC} }
 	if .Trunc in flags         { sys_flags += {.TRUNC} }
-	if .Close_On_Exec in flags { sys_flags += {.CLOEXEC} }
+	if .Inheritable in flags   { sys_flags -= {.CLOEXEC} }
 
-	fd, errno := linux.open(name_cstr, sys_flags, transmute(linux.Mode)(u32(perm)))
+	fd, errno := linux.open(name_cstr, sys_flags, transmute(linux.Mode)u32(perm))
 	if errno != .NONE {
 		return nil, _get_platform_error(errno)
 	}
 
-	return _new_file(uintptr(fd), name), nil
+	return _new_file(uintptr(fd), name)
 }
 
-_new_file :: proc(fd: uintptr, _: string = "") -> ^File {
-	file := new(File, file_allocator())
-	_construct_file(file, fd, "")
-	return file
-}
-
-_construct_file :: proc(file: ^File, fd: uintptr, _: string = "") {
-	file^ = {
-		impl = {
-			fd = linux.Fd(fd),
-			allocator = file_allocator(),
-			name = _get_full_path(file.impl.fd, file.impl.allocator),
-		},
-		stream = {
-			data = file,
-			procedure = _file_stream_proc,
-		},
+_new_file :: proc(fd: uintptr, _: string = "") -> (f: ^File, err: Error) {
+	impl := new(File_Impl, file_allocator()) or_return
+	defer if err != nil {
+		free(impl, file_allocator())
 	}
+	impl.file.impl = impl
+	impl.fd = linux.Fd(fd)
+	impl.allocator = file_allocator()
+	impl.name = _get_full_path(impl.fd, file_allocator()) or_return
+	impl.file.stream = {
+		data = impl,
+		procedure = _file_stream_proc,
+	}
+	impl.file.fstat = _fstat
+	return &impl.file, nil
 }
 
-_destroy :: proc(f: ^File) -> Error {
+_destroy :: proc(f: ^File_Impl) -> Error {
 	if f == nil {
 		return nil
 	}
-	delete(f.impl.name, f.impl.allocator)
-	free(f, f.impl.allocator)
+	a := f.allocator
+	delete(f.name, a)
+	free(f, a)
 	return nil
 }
 
 
-_close :: proc(f: ^File) -> Error {
-	if f == nil {
+_close :: proc(f: ^File_Impl) -> Error {
+	if f == nil{
 		return nil
 	}
-	errno := linux.close(f.impl.fd)
+	errno := linux.close(f.fd)
 	if errno == .EBADF { // avoid possible double free
 		return _get_platform_error(errno)
 	}
@@ -131,41 +132,41 @@ _close :: proc(f: ^File) -> Error {
 }
 
 _fd :: proc(f: ^File) -> uintptr {
-	if f == nil {
+	if f == nil || f.impl == nil {
 		return ~uintptr(0)
 	}
-	return uintptr(f.impl.fd)
+	impl := (^File_Impl)(f.impl)
+	return uintptr(impl.fd)
 }
 
 _name :: proc(f: ^File) -> string {
-	return f.impl.name if f != nil else ""
+	return (^File_Impl)(f.impl).name if f != nil && f.impl != nil else ""
 }
 
-_seek :: proc(f: ^File, offset: i64, whence: io.Seek_From) -> (ret: i64, err: Error) {
-	n, errno := linux.lseek(f.impl.fd, offset, linux.Seek_Whence(whence))
+_seek :: proc(f: ^File_Impl, offset: i64, whence: io.Seek_From) -> (ret: i64, err: Error) {
+	n, errno := linux.lseek(f.fd, offset, linux.Seek_Whence(whence))
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
 	return n, nil
 }
 
-_read :: proc(f: ^File, p: []byte) -> (i64, Error) {
+_read :: proc(f: ^File_Impl, p: []byte) -> (i64, Error) {
 	if len(p) == 0 {
 		return 0, nil
 	}
-	n, errno := linux.read(f.impl.fd, p[:])
+	n, errno := linux.read(f.fd, p[:])
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
-	return i64(n), n == 0 ? io.Error.EOF : nil
+	return i64(n), io.Error.EOF if n == 0 else nil
 }
 
-_read_at :: proc(f: ^File, p: []byte, offset: i64) -> (i64, Error) {
+_read_at :: proc(f: ^File_Impl, p: []byte, offset: i64) -> (i64, Error) {
 	if offset < 0 {
 		return 0, .Invalid_Offset
 	}
-
-	n, errno := linux.pread(f.impl.fd, p[:], offset)
+	n, errno := linux.pread(f.fd, p[:], offset)
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
@@ -175,32 +176,31 @@ _read_at :: proc(f: ^File, p: []byte, offset: i64) -> (i64, Error) {
 	return i64(n), nil
 }
 
-_write :: proc(f: ^File, p: []byte) -> (i64, Error) {
+_write :: proc(f: ^File_Impl, p: []byte) -> (i64, Error) {
 	if len(p) == 0 {
 		return 0, nil
 	}
-	n, errno := linux.write(f.impl.fd, p[:])
+	n, errno := linux.write(f.fd, p[:])
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
 	return i64(n), nil
 }
 
-_write_at :: proc(f: ^File, p: []byte, offset: i64) -> (i64, Error) {
+_write_at :: proc(f: ^File_Impl, p: []byte, offset: i64) -> (i64, Error) {
 	if offset < 0 {
 		return 0, .Invalid_Offset
 	}
-
-	n, errno := linux.pwrite(f.impl.fd, p[:], offset)
+	n, errno := linux.pwrite(f.fd, p[:], offset)
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
 	return i64(n), nil
 }
 
-_file_size :: proc(f: ^File) -> (n: i64, err: Error) {
+_file_size :: proc(f: ^File_Impl) -> (n: i64, err: Error) {
 	s: linux.Stat = ---
-	errno := linux.fstat(f.impl.fd, &s)
+	errno := linux.fstat(f.fd, &s)
 	if errno != .NONE {
 		return -1, _get_platform_error(errno)
 	}
@@ -208,27 +208,38 @@ _file_size :: proc(f: ^File) -> (n: i64, err: Error) {
 }
 
 _sync :: proc(f: ^File) -> Error {
-	return _get_platform_error(linux.fsync(f.impl.fd))
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.fsync(impl.fd))
 }
 
-_flush :: proc(f: ^File) -> Error {
-	return _get_platform_error(linux.fsync(f.impl.fd))
+_flush :: proc(f: ^File_Impl) -> Error {
+	return _get_platform_error(linux.fsync(f.fd))
 }
 
 _truncate :: proc(f: ^File, size: i64) -> Error {
-	return _get_platform_error(linux.ftruncate(f.impl.fd, size))
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.ftruncate(impl.fd, size))
 }
 
 _remove :: proc(name: string) -> Error {
+	is_dir_fd :: proc(fd: linux.Fd) -> bool {
+		s: linux.Stat
+		if linux.fstat(fd, &s) != .NONE {
+			return false
+		}
+		return linux.S_ISDIR(s.mode)
+	}
+
 	TEMP_ALLOCATOR_GUARD()
 	name_cstr := temp_cstring(name) or_return
 
 	fd, errno := linux.open(name_cstr, {.NOFOLLOW})
 	#partial switch (errno) {
-	case .ELOOP: /* symlink */
+	case .ELOOP:
+		/* symlink */
 	case .NONE:
 		defer linux.close(fd)
-		if _is_dir_fd(fd) {
+		if is_dir_fd(fd) {
 			return _get_platform_error(linux.rmdir(name_cstr))
 		}
 	case:
@@ -292,17 +303,19 @@ _chdir :: proc(name: string) -> Error {
 }
 
 _fchdir :: proc(f: ^File) -> Error {
-	return _get_platform_error(linux.fchdir(f.impl.fd))
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.fchdir(impl.fd))
 }
 
-_chmod :: proc(name: string, mode: File_Mode) -> Error {
+_chmod :: proc(name: string, mode: int) -> Error {
 	TEMP_ALLOCATOR_GUARD()
 	name_cstr := temp_cstring(name) or_return
 	return _get_platform_error(linux.chmod(name_cstr, transmute(linux.Mode)(u32(mode))))
 }
 
-_fchmod :: proc(f: ^File, mode: File_Mode) -> Error {
-	return _get_platform_error(linux.fchmod(f.impl.fd, transmute(linux.Mode)(u32(mode))))
+_fchmod :: proc(f: ^File, mode: int) -> Error {
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.fchmod(impl.fd, transmute(linux.Mode)(u32(mode))))
 }
 
 // NOTE: will throw error without super user priviledges
@@ -321,7 +334,8 @@ _lchown :: proc(name: string, uid, gid: int) -> Error {
 
 // NOTE: will throw error without super user priviledges
 _fchown :: proc(f: ^File, uid, gid: int) -> Error {
-	return _get_platform_error(linux.fchown(f.impl.fd, linux.Uid(uid), linux.Gid(gid)))
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.fchown(impl.fd, linux.Uid(uid), linux.Gid(gid)))
 }
 
 _chtimes :: proc(name: string, atime, mtime: time.Time) -> Error {
@@ -351,7 +365,8 @@ _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
 			uint(mtime._nsec) % uint(time.Second),
 		},
 	}
-	return _get_platform_error(linux.utimensat(f.impl.fd, nil, &times[0], nil))
+	impl := (^File_Impl)(f.impl)
+	return _get_platform_error(linux.utimensat(impl.fd, nil, &times[0], nil))
 }
 
 _exists :: proc(name: string) -> bool {
@@ -361,42 +376,6 @@ _exists :: proc(name: string) -> bool {
 	return !res && errno == .NONE
 }
 
-_is_file :: proc(name: string) -> bool {
-	TEMP_ALLOCATOR_GUARD()
-	name_cstr, _ := temp_cstring(name)
-	s: linux.Stat
-	if linux.stat(name_cstr, &s) != .NONE {
-		return false
-	}
-	return linux.S_ISREG(s.mode)
-}
-
-_is_file_fd :: proc(fd: linux.Fd) -> bool {
-	s: linux.Stat
-	if linux.fstat(fd, &s) != .NONE {
-		return false
-	}
-	return linux.S_ISREG(s.mode)
-}
-
-_is_dir :: proc(name: string) -> bool {
-	TEMP_ALLOCATOR_GUARD()
-	name_cstr, _ := temp_cstring(name)
-	s: linux.Stat
-	if linux.stat(name_cstr, &s) != .NONE {
-		return false
-	}
-	return linux.S_ISDIR(s.mode)
-}
-
-_is_dir_fd :: proc(fd: linux.Fd) -> bool {
-	s: linux.Stat
-	if linux.fstat(fd, &s) != .NONE {
-		return false
-	}
-	return linux.S_ISDIR(s.mode)
-}
-
 /* Certain files in the Linux file system are not actual
  * files (e.g. everything in /proc/). Therefore, the
  * read_entire_file procs fail to actually read anything
@@ -443,7 +422,7 @@ _read_entire_pseudo_file_cstring :: proc(name: cstring, allocator: runtime.Alloc
 
 @(private="package")
 _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offset: i64, whence: io.Seek_From) -> (n: i64, err: io.Error) {
-	f := (^File)(stream_data)
+	f := (^File_Impl)(stream_data)
 	ferr: Error
 	switch mode {
 	case .Read:
diff --git a/core/os/os2/file_util.odin b/core/os/os2/file_util.odin
index 977979bae..2011d1cc4 100644
--- a/core/os/os2/file_util.odin
+++ b/core/os/os2/file_util.odin
@@ -8,6 +8,18 @@ write_string :: proc(f: ^File, s: string) -> (n: int, err: Error) {
 	return write(f, transmute([]byte)s)
 }
 
+write_strings :: proc(f: ^File, strings: ..string) -> (n: int, err: Error) {
+	for s in strings {
+		m: int
+		m, err = write_string(f, s)
+		n += m
+		if err != nil {
+			return
+		}
+	}
+	return
+}
+
 write_byte :: proc(f: ^File, b: byte) -> (n: int, err: Error) {
 	return write(f, []byte{b})
 }
@@ -138,7 +150,7 @@ read_entire_file_from_file :: proc(f: ^File, allocator: runtime.Allocator) -> (d
 }
 
 @(require_results)
-write_entire_file :: proc(name: string, data: []byte, perm: File_Mode, truncate := true) -> Error {
+write_entire_file :: proc(name: string, data: []byte, perm: int, truncate := true) -> Error {
 	flags := O_WRONLY|O_CREATE
 	if truncate {
 		flags |= O_TRUNC
diff --git a/core/os/os2/file_windows.odin b/core/os/os2/file_windows.odin
index 37f8f44de..48a5427f1 100644
--- a/core/os/os2/file_windows.odin
+++ b/core/os/os2/file_windows.odin
@@ -17,17 +17,19 @@ _ERROR_BAD_NETPATH :: 53
 MAX_RW :: 1<<30
 
 
-_File_Kind :: enum u8 {
+File_Impl_Kind :: enum u8 {
 	File,
 	Console,
 	Pipe,
 }
 
-_File :: struct {
+File_Impl :: struct {
+	file: File,
+
 	fd:   rawptr,
 	name: string,
 	wname: win32.wstring,
-	kind: _File_Kind,
+	kind: File_Impl_Kind,
 
 	allocator: runtime.Allocator,
 
@@ -53,13 +55,14 @@ _handle :: proc(f: ^File) -> win32.HANDLE {
 	return win32.HANDLE(_fd(f))
 }
 
-_open_internal :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (handle: uintptr, err: Error) {
+_open_internal :: proc(name: string, flags: File_Flags, perm: int) -> (handle: uintptr, err: Error) {
 	if len(name) == 0 {
 		err = .Not_Exist
 		return
 	}
+	TEMP_ALLOCATOR_GUARD()
 
-	path := _fix_long_path(name)
+	path := _fix_long_path(name, temp_allocator()) or_return
 	access: u32
 	switch flags & {.Read, .Write} {
 	case {.Read}:         access = win32.FILE_GENERIC_READ
@@ -75,11 +78,9 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (han
 		access |= win32.FILE_APPEND_DATA
 	}
 	share_mode := u32(win32.FILE_SHARE_READ | win32.FILE_SHARE_WRITE)
-	sa: ^win32.SECURITY_ATTRIBUTES
-	if .Close_On_Exec not_in flags {
-		sa = &win32.SECURITY_ATTRIBUTES{}
-		sa.nLength = size_of(win32.SECURITY_ATTRIBUTES)
-		sa.bInheritHandle = true
+	sa := win32.SECURITY_ATTRIBUTES {
+		nLength = size_of(win32.SECURITY_ATTRIBUTES),
+		bInheritHandle = .Inheritable in flags,
 	}
 
 	create_mode: u32 = win32.OPEN_EXISTING
@@ -94,14 +95,14 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (han
 		create_mode = win32.TRUNCATE_EXISTING
 	}
 
-	attrs: u32 = win32.FILE_ATTRIBUTE_NORMAL
+	attrs: u32 = win32.FILE_ATTRIBUTE_NORMAL|win32.FILE_FLAG_BACKUP_SEMANTICS
 	if perm & S_IWRITE == 0 {
 		attrs = win32.FILE_ATTRIBUTE_READONLY
 		if create_mode == win32.CREATE_ALWAYS {
 			// NOTE(bill): Open has just asked to create a file in read-only mode.
 			// If the file already exists, to make it akin to a *nix open call,
 			// the call preserves the existing permissions.
-			h := win32.CreateFileW(path, access, share_mode, sa, win32.TRUNCATE_EXISTING, win32.FILE_ATTRIBUTE_NORMAL, nil)
+			h := win32.CreateFileW(path, access, share_mode, &sa, win32.TRUNCATE_EXISTING, win32.FILE_ATTRIBUTE_NORMAL, nil)
 			if h == win32.INVALID_HANDLE {
 				switch e := win32.GetLastError(); e {
 				case win32.ERROR_FILE_NOT_FOUND, _ERROR_BAD_NETPATH, win32.ERROR_PATH_NOT_FOUND:
@@ -109,12 +110,13 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (han
 				case 0:
 					return uintptr(h), nil
 				case:
-					return 0, Platform_Error(e)
+					return 0, _get_platform_error()
 				}
 			}
 		}
 	}
-	h := win32.CreateFileW(path, access, share_mode, sa, create_mode, attrs, nil)
+
+	h := win32.CreateFileW(path, access, share_mode, &sa, create_mode, attrs, nil)
 	if h == win32.INVALID_HANDLE {
 		return 0, _get_platform_error()
 	}
@@ -122,85 +124,95 @@ _open_internal :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (han
 }
 
 
-_open :: proc(name: string, flags: File_Flags, perm: File_Mode) -> (f: ^File, err: Error) {
+_open :: proc(name: string, flags: File_Flags, perm: int) -> (f: ^File, err: Error) {
 	flags := flags if flags != nil else {.Read}
-	handle := _open_internal(name, flags + {.Close_On_Exec}, perm) or_return
-	return _new_file(handle, name), nil
+	handle := _open_internal(name, flags, perm) or_return
+	return _new_file(handle, name)
 }
 
-_new_file :: proc(handle: uintptr, name: string) -> ^File {
+_new_file :: proc(handle: uintptr, name: string) -> (f: ^File, err: Error) {
 	if handle == INVALID_HANDLE {
-		return nil
+		return
+	}
+	impl := new(File_Impl, file_allocator()) or_return
+	defer if err != nil {
+		free(impl, file_allocator())
 	}
-	f := new(File, file_allocator())
 
-	f.impl.allocator = file_allocator()
-	f.impl.fd = rawptr(handle)
-	f.impl.name, _ = clone_string(name, f.impl.allocator)
-	f.impl.wname = win32.utf8_to_wstring(name, f.impl.allocator)
+	impl.file.impl = impl
 
-	handle := _handle(f)
-	kind := _File_Kind.File
+	impl.allocator = file_allocator()
+	impl.fd = rawptr(handle)
+	impl.name = clone_string(name, impl.allocator) or_return
+	impl.wname = win32_utf8_to_wstring(name, impl.allocator) or_return
+
+	handle := _handle(&impl.file)
+	kind := File_Impl_Kind.File
 	if m: u32; win32.GetConsoleMode(handle, &m) {
 		kind = .Console
 	}
 	if win32.GetFileType(handle) == win32.FILE_TYPE_PIPE {
 		kind = .Pipe
 	}
-	f.impl.kind = kind
+	impl.kind = kind
 
-	f.stream = {
-		data = f,
+	impl.file.stream = {
+		data = impl,
 		procedure = _file_stream_proc,
 	}
+	impl.file.fstat = _fstat
 
-	return f
+	return &impl.file, nil
 }
 
 _fd :: proc(f: ^File) -> uintptr {
-	if f == nil {
+	if f == nil || f.impl == nil {
 		return INVALID_HANDLE
 	}
-	return uintptr(f.impl.fd)
+	return uintptr((^File_Impl)(f.impl).fd)
 }
 
-_destroy :: proc(f: ^File) -> Error {
+_destroy :: proc(f: ^File_Impl) -> Error {
 	if f == nil {
 		return nil
 	}
 
-	a := f.impl.allocator
-	free(f.impl.wname, a)
-	delete(f.impl.name, a)
-	free(f, a)
+	a := f.allocator
+	err0 := free(f.wname, a)
+	err1 := delete(f.name, a)
+	err2 := free(f, a)
+	err0 or_return
+	err1 or_return
+	err2 or_return
 	return nil
 }
 
 
-_close :: proc(f: ^File) -> Error {
-	if f == nil {
+_close :: proc(f: ^File_Impl) -> Error {
+	if f == nil  {
 		return nil
 	}
-	if !win32.CloseHandle(win32.HANDLE(f.impl.fd)) {
+	if !win32.CloseHandle(win32.HANDLE(f.fd)) {
 		return .Closed
 	}
 	return _destroy(f)
 }
 
 _name :: proc(f: ^File) -> string {
-	return f.impl.name if f != nil else ""
+	return (^File_Impl)(f.impl).name if f != nil && f.impl != nil else ""
 }
 
-_seek :: proc(f: ^File, offset: i64, whence: io.Seek_From) -> (ret: i64, err: Error) {
-	handle := _handle(f)
+_seek :: proc(f: ^File_Impl, offset: i64, whence: io.Seek_From) -> (ret: i64, err: Error) {
+	handle := _handle(&f.file)
 	if handle == win32.INVALID_HANDLE {
 		return 0, .Invalid_File
 	}
-	if f.impl.kind == .Pipe {
+
+	if f.kind == .Pipe {
 		return 0, .Invalid_File
 	}
 
-	sync.guard(&f.impl.rw_mutex)
+	sync.guard(&f.rw_mutex)
 
 	w: u32
 	switch whence {
@@ -218,13 +230,13 @@ _seek :: proc(f: ^File, offset: i64, whence: io.Seek_From) -> (ret: i64, err: Er
 	return i64(hi)<<32 + i64(dw_ptr), nil
 }
 
-_read :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
+_read :: proc(f: ^File_Impl, p: []byte) -> (n: i64, err: Error) {
 	read_console :: proc(handle: win32.HANDLE, b: []byte) -> (n: int, err: Error) {
 		if len(b) == 0 {
 			return 0, nil
 		}
 
-		// TODO(bill): should this be moved to `_File` instead?
+		// TODO(bill): should this be moved to `File_Impl` instead?
 		BUF_SIZE :: 386
 		buf16: [BUF_SIZE]u16
 		buf8: [4*BUF_SIZE]u8
@@ -269,18 +281,18 @@ _read :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
 		return
 	}
 
-	handle := _handle(f)
+	handle := _handle(&f.file)
 
 	single_read_length: win32.DWORD
 	total_read: int
 	length := len(p)
 
-	sync.shared_guard(&f.impl.rw_mutex) // multiple readers
+	sync.shared_guard(&f.rw_mutex) // multiple readers
 
-	if sync.guard(&f.impl.p_mutex) {
+	if sync.guard(&f.p_mutex) {
 		to_read := min(win32.DWORD(length), MAX_RW)
 		ok: win32.BOOL
-		if f.impl.kind == .Console {
+		if f.kind == .Console {
 			n, cerr := read_console(handle, p[total_read:][:to_read])
 			total_read += n
 			if cerr != nil {
@@ -300,15 +312,15 @@ _read :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
 	return i64(total_read), err
 }
 
-_read_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
-	pread :: proc(f: ^File, data: []byte, offset: i64) -> (n: i64, err: Error) {
+_read_at :: proc(f: ^File_Impl, p: []byte, offset: i64) -> (n: i64, err: Error) {
+	pread :: proc(f: ^File_Impl, data: []byte, offset: i64) -> (n: i64, err: Error) {
 		buf := data
 		if len(buf) > MAX_RW {
 			buf = buf[:MAX_RW]
 
 		}
-		curr_offset := seek(f, offset, .Current) or_return
-		defer seek(f, curr_offset, .Start)
+		curr_offset := _seek(f, offset, .Current) or_return
+		defer _seek(f, curr_offset, .Start)
 
 		o := win32.OVERLAPPED{
 			OffsetHigh = u32(offset>>32),
@@ -317,7 +329,7 @@ _read_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
 
 		// TODO(bill): Determine the correct behaviour for consoles
 
-		h := _handle(f)
+		h := _handle(&f.file)
 		done: win32.DWORD
 		if !win32.ReadFile(h, raw_data(buf), u32(len(buf)), &done, &o) {
 			err = _get_platform_error()
@@ -327,7 +339,7 @@ _read_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
 		return
 	}
 
-	sync.guard(&f.impl.p_mutex)
+	sync.guard(&f.p_mutex)
 
 	p, offset := p, offset
 	for len(p) > 0 {
@@ -339,7 +351,7 @@ _read_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
 	return
 }
 
-_write :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
+_write :: proc(f: ^File_Impl, p: []byte) -> (n: i64, err: Error) {
 	if len(p) == 0 {
 		return
 	}
@@ -348,9 +360,9 @@ _write :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
 	total_write: i64
 	length := i64(len(p))
 
-	handle := _handle(f)
+	handle := _handle(&f.file)
 
-	sync.guard(&f.impl.rw_mutex)
+	sync.guard(&f.rw_mutex)
 	for total_write < length {
 		remaining := length - total_write
 		to_write := win32.DWORD(min(i32(remaining), MAX_RW))
@@ -366,22 +378,22 @@ _write :: proc(f: ^File, p: []byte) -> (n: i64, err: Error) {
 	return i64(total_write), nil
 }
 
-_write_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
-	pwrite :: proc(f: ^File, data: []byte, offset: i64) -> (n: i64, err: Error) {
+_write_at :: proc(f: ^File_Impl, p: []byte, offset: i64) -> (n: i64, err: Error) {
+	pwrite :: proc(f: ^File_Impl, data: []byte, offset: i64) -> (n: i64, err: Error) {
 		buf := data
 		if len(buf) > MAX_RW {
 			buf = buf[:MAX_RW]
 
 		}
-		curr_offset := seek(f, offset, .Current) or_return
-		defer seek(f, curr_offset, .Start)
+		curr_offset := _seek(f, offset, .Current) or_return
+		defer _seek(f, curr_offset, .Start)
 
 		o := win32.OVERLAPPED{
 			OffsetHigh = u32(offset>>32),
 			Offset = u32(offset),
 		}
 
-		h := _handle(f)
+		h := _handle(&f.file)
 		done: win32.DWORD
 		if !win32.WriteFile(h, raw_data(buf), u32(len(buf)), &done, &o) {
 			err = _get_platform_error()
@@ -391,7 +403,7 @@ _write_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
 		return
 	}
 
-	sync.guard(&f.impl.p_mutex)
+	sync.guard(&f.p_mutex)
 	p, offset := p, offset
 	for len(p) > 0 {
 		m := pwrite(f, p, offset) or_return
@@ -402,12 +414,12 @@ _write_at :: proc(f: ^File, p: []byte, offset: i64) -> (n: i64, err: Error) {
 	return
 }
 
-_file_size :: proc(f: ^File) -> (n: i64, err: Error) {
+_file_size :: proc(f: ^File_Impl) -> (n: i64, err: Error) {
 	length: win32.LARGE_INTEGER
-	if f.impl.kind == .Pipe {
+	if f.kind == .Pipe {
 		return 0, .No_Size
 	}
-	handle := _handle(f)
+	handle := _handle(&f.file)
 	if !win32.GetFileSizeEx(handle, &length) {
 		err = _get_platform_error()
 	}
@@ -417,11 +429,14 @@ _file_size :: proc(f: ^File) -> (n: i64, err: Error) {
 
 
 _sync :: proc(f: ^File) -> Error {
-	return _flush(f)
+	if f != nil && f.impl != nil {
+		return _flush((^File_Impl)(f.impl))
+	}
+	return nil
 }
 
-_flush :: proc(f: ^File) -> Error {
-	handle := _handle(f)
+_flush :: proc(f: ^File_Impl) -> Error {
+	handle := _handle(&f.file)
 	if !win32.FlushFileBuffers(handle) {
 		return _get_platform_error()
 	}
@@ -429,7 +444,7 @@ _flush :: proc(f: ^File) -> Error {
 }
 
 _truncate :: proc(f: ^File, size: i64) -> Error {
-	if f == nil {
+	if f == nil || f.impl == nil {
 		return nil
 	}
 	curr_off := seek(f, 0, .Current) or_return
@@ -443,7 +458,8 @@ _truncate :: proc(f: ^File, size: i64) -> Error {
 }
 
 _remove :: proc(name: string) -> Error {
-	p := _fix_long_path(name)
+	TEMP_ALLOCATOR_GUARD()
+	p := _fix_long_path(name, temp_allocator()) or_return
 	err, err1: Error
 	if !win32.DeleteFileW(p) {
 		err = _get_platform_error()
@@ -480,8 +496,9 @@ _remove :: proc(name: string) -> Error {
 }
 
 _rename :: proc(old_path, new_path: string) -> Error {
-	from := _fix_long_path(old_path)
-	to := _fix_long_path(new_path)
+	TEMP_ALLOCATOR_GUARD()
+	from := _fix_long_path(old_path, temp_allocator()) or_return
+	to   := _fix_long_path(new_path, temp_allocator()) or_return
 	if win32.MoveFileExW(from, to, win32.MOVEFILE_REPLACE_EXISTING) {
 		return nil
 	}
@@ -489,10 +506,10 @@ _rename :: proc(old_path, new_path: string) -> Error {
 
 }
 
-
 _link :: proc(old_name, new_name: string) -> Error {
-	o := _fix_long_path(old_name)
-	n := _fix_long_path(new_name)
+	TEMP_ALLOCATOR_GUARD()
+	o := _fix_long_path(old_name, temp_allocator()) or_return
+	n := _fix_long_path(new_name, temp_allocator()) or_return
 	if win32.CreateHardLinkW(n, o, nil) {
 		return nil
 	}
@@ -532,16 +549,16 @@ _normalize_link_path :: proc(p: []u16, allocator: runtime.Allocator) -> (str: st
 	}
 
 	if !has_unc_prefix(p) {
-		return win32.utf16_to_utf8(p, allocator)
+		return win32_utf16_to_utf8(p, allocator)
 	}
 
 	ws := p[4:]
 	switch {
 	case len(ws) >= 2 && ws[1] == ':':
-		return win32.utf16_to_utf8(ws, allocator)
+		return win32_utf16_to_utf8(ws, allocator)
 	case has_prefix(ws, `UNC\`):
 		ws[3] = '\\' // override data in buffer
-		return win32.utf16_to_utf8(ws[3:], allocator)
+		return win32_utf16_to_utf8(ws[3:], allocator)
 	}
 
 
@@ -566,9 +583,9 @@ _normalize_link_path :: proc(p: []u16, allocator: runtime.Allocator) -> (str: st
 		ws = ws[4:]
 		if len(ws) > 3 && has_prefix(ws, `UNC`) {
 			ws[2] = '\\'
-			return win32.utf16_to_utf8(ws[2:], allocator)
+			return win32_utf16_to_utf8(ws[2:], allocator)
 		}
-		return win32.utf16_to_utf8(ws, allocator)
+		return win32_utf16_to_utf8(ws, allocator)
 	}
 	return "", .Invalid_Path
 }
@@ -579,7 +596,9 @@ _read_link :: proc(name: string, allocator: runtime.Allocator) -> (s: string, er
 	@thread_local
 	rdb_buf: [MAXIMUM_REPARSE_DATA_BUFFER_SIZE]byte
 
-	p := _fix_long_path(name)
+	TEMP_ALLOCATOR_GUARD()
+
+	p      := _fix_long_path(name, temp_allocator()) or_return
 	handle := _open_sym_link(p) or_return
 	defer win32.CloseHandle(handle)
 
@@ -599,7 +618,7 @@ _read_link :: proc(name: string, allocator: runtime.Allocator) -> (s: string, er
 		pb[rb.SubstituteNameOffset+rb.SubstituteNameLength] = 0
 		p := pb[rb.SubstituteNameOffset:][:rb.SubstituteNameLength]
 		if rb.Flags & win32.SYMLINK_FLAG_RELATIVE != 0 {
-			return win32.utf16_to_utf8(p, allocator)
+			return win32_utf16_to_utf8(p, allocator)
 		}
 		return _normalize_link_path(p, allocator)
 
@@ -616,17 +635,18 @@ _read_link :: proc(name: string, allocator: runtime.Allocator) -> (s: string, er
 
 
 _fchdir :: proc(f: ^File) -> Error {
-	if f == nil {
+	if f == nil || f.impl == nil {
 		return nil
 	}
-	if !win32.SetCurrentDirectoryW(f.impl.wname) {
+	impl := (^File_Impl)(f.impl)
+	if !win32.SetCurrentDirectoryW(impl.wname) {
 		return _get_platform_error()
 	}
 	return nil
 }
 
-_fchmod :: proc(f: ^File, mode: File_Mode) -> Error {
-	if f == nil {
+_fchmod :: proc(f: ^File, mode: int) -> Error {
+	if f == nil || f.impl == nil {
 		return nil
 	}
 	d: win32.BY_HANDLE_FILE_INFORMATION
@@ -653,14 +673,15 @@ _fchown :: proc(f: ^File, uid, gid: int) -> Error {
 }
 
 _chdir :: proc(name: string) -> Error {
-	p := _fix_long_path(name)
+	TEMP_ALLOCATOR_GUARD()
+	p := _fix_long_path(name, temp_allocator()) or_return
 	if !win32.SetCurrentDirectoryW(p) {
 		return _get_platform_error()
 	}
 	return nil
 }
 
-_chmod :: proc(name: string, mode: File_Mode) -> Error {
+_chmod :: proc(name: string, mode: int) -> Error {
 	f := open(name, {.Write}) or_return
 	defer close(f)
 	return _fchmod(f, mode)
@@ -681,7 +702,7 @@ _chtimes :: proc(name: string, atime, mtime: time.Time) -> Error {
 	return _fchtimes(f, atime, mtime)
 }
 _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
-	if f == nil {
+	if f == nil || f.impl == nil {
 		return nil
 	}
 	d: win32.BY_HANDLE_FILE_INFORMATION
@@ -708,36 +729,16 @@ _fchtimes :: proc(f: ^File, atime, mtime: time.Time) -> Error {
 	return nil
 }
 
-
-
 _exists :: proc(path: string) -> bool {
-	wpath := _fix_long_path(path)
+	TEMP_ALLOCATOR_GUARD()
+	wpath, _ := _fix_long_path(path, temp_allocator())
 	attribs := win32.GetFileAttributesW(wpath)
 	return attribs != win32.INVALID_FILE_ATTRIBUTES
 }
 
-_is_file :: proc(path: string) -> bool {
-	wpath := _fix_long_path(path)
-	attribs := win32.GetFileAttributesW(wpath)
-	if attribs != win32.INVALID_FILE_ATTRIBUTES {
-		return attribs & win32.FILE_ATTRIBUTE_DIRECTORY == 0
-	}
-	return false
-}
-
-_is_dir :: proc(path: string) -> bool {
-	wpath := _fix_long_path(path)
-	attribs := win32.GetFileAttributesW(wpath)
-	if attribs != win32.INVALID_FILE_ATTRIBUTES {
-		return attribs & win32.FILE_ATTRIBUTE_DIRECTORY != 0
-	}
-	return false
-}
-
-
 @(private="package")
 _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte, offset: i64, whence: io.Seek_From) -> (n: i64, err: io.Error) {
-	f := (^File)(stream_data)
+	f := (^File_Impl)(stream_data)
 	ferr: Error
 	switch mode {
 	case .Read:
@@ -778,3 +779,85 @@ _file_stream_proc :: proc(stream_data: rawptr, mode: io.Stream_Mode, p: []byte,
 	return 0, .Empty
 }
 
+
+
+@(private="package", require_results)
+win32_utf8_to_wstring :: proc(s: string, allocator: runtime.Allocator) -> (ws: [^]u16, err: runtime.Allocator_Error) {
+	ws = raw_data(win32_utf8_to_utf16(s, allocator) or_return)
+	return
+}
+
+@(private="package", require_results)
+win32_utf8_to_utf16 :: proc(s: string, allocator: runtime.Allocator) -> (ws: []u16, err: runtime.Allocator_Error) {
+	if len(s) < 1 {
+		return
+	}
+
+	b := transmute([]byte)s
+	cstr := raw_data(b)
+	n := win32.MultiByteToWideChar(win32.CP_UTF8, win32.MB_ERR_INVALID_CHARS, cstr, i32(len(s)), nil, 0)
+	if n == 0 {
+		return nil, nil
+	}
+
+	text := make([]u16, n+1, allocator) or_return
+
+	n1 := win32.MultiByteToWideChar(win32.CP_UTF8, win32.MB_ERR_INVALID_CHARS, cstr, i32(len(s)), raw_data(text), n)
+	if n1 == 0 {
+		delete(text, allocator)
+		return
+	}
+
+	text[n] = 0
+	for n >= 1 && text[n-1] == 0 {
+		n -= 1
+	}
+	ws = text[:n]
+	return
+}
+
+@(private="package", require_results)
+win32_wstring_to_utf8 :: proc(s: [^]u16, allocator: runtime.Allocator) -> (res: string, err: runtime.Allocator_Error) {
+	if s == nil || s[0] == 0 {
+		return "", nil
+	}
+	n := 0
+	for s[n] != 0 {
+		n += 1
+	}
+	return win32_utf16_to_utf8(s[:n], allocator)
+}
+
+@(private="package", require_results)
+win32_utf16_to_utf8 :: proc(s: []u16, allocator: runtime.Allocator) -> (res: string, err: runtime.Allocator_Error) {
+	if len(s) == 0 {
+		return
+	}
+
+	n := win32.WideCharToMultiByte(win32.CP_UTF8, win32.WC_ERR_INVALID_CHARS, raw_data(s), i32(len(s)), nil, 0, nil, nil)
+	if n == 0 {
+		return
+	}
+
+	// If N < 0 the call to WideCharToMultiByte assume the wide string is null terminated
+	// and will scan it to find the first null terminated character. The resulting string will
+	// also be null terminated.
+	// If N > 0 it assumes the wide string is not null terminated and the resulting string
+	// will not be null terminated.
+	text := make([]byte, n, allocator) or_return
+
+	n1 := win32.WideCharToMultiByte(win32.CP_UTF8, win32.WC_ERR_INVALID_CHARS, raw_data(s), i32(len(s)), raw_data(text), n, nil, nil)
+	if n1 == 0 {
+		delete(text, allocator)
+		return
+	}
+
+	for i in 0..<n {
+		if text[i] == 0 {
+			n = i
+			break
+		}
+	}
+	res = string(text[:n])
+	return
+}
diff --git a/core/os/os2/heap.odin b/core/os/os2/heap.odin
index e0cffaf0d..8f9c7680a 100644
--- a/core/os/os2/heap.odin
+++ b/core/os/os2/heap.odin
@@ -17,7 +17,3 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: runtime.Allocator_Mode
                             old_memory: rawptr, old_size: int, loc := #caller_location) -> ([]byte, runtime.Allocator_Error) {
 	return _heap_allocator_proc(allocator_data, mode, size, alignment, old_memory, old_size, loc)
 }
-
-
-@(private)
-error_allocator := heap_allocator
diff --git a/core/os/os2/internal_util.odin b/core/os/os2/internal_util.odin
index e26cf7439..f7a38f3f1 100644
--- a/core/os/os2/internal_util.odin
+++ b/core/os/os2/internal_util.odin
@@ -126,3 +126,5 @@ random_string :: proc(buf: []byte) -> string {
 	buf[i] = digits[u % b]
 	return string(buf[i:])
 }
+
+
diff --git a/core/os/os2/path.odin b/core/os/os2/path.odin
index 27c3d6b0b..3bf422ccb 100644
--- a/core/os/os2/path.odin
+++ b/core/os/os2/path.odin
@@ -12,12 +12,14 @@ is_path_separator :: proc(c: byte) -> bool {
 }
 
 mkdir :: make_directory
-make_directory :: proc(name: string, perm: File_Mode) -> Error {
+
+make_directory :: proc(name: string, perm: int) -> Error {
 	return _mkdir(name, perm)
 }
 
 mkdir_all :: make_directory_all
-make_directory_all :: proc(path: string, perm: File_Mode) -> Error {
+
+make_directory_all :: proc(path: string, perm: int) -> Error {
 	return _mkdir_all(path, perm)
 }
 
@@ -25,14 +27,15 @@ remove_all :: proc(path: string) -> Error {
 	return _remove_all(path)
 }
 
-
 getwd :: get_working_directory
+
 @(require_results)
 get_working_directory :: proc(allocator: runtime.Allocator) -> (dir: string, err: Error) {
-	return _getwd(allocator)
+	return _get_working_directory(allocator)
 }
 
 setwd :: set_working_directory
+
 set_working_directory :: proc(dir: string) -> (err: Error) {
-	return _setwd(dir)
+	return _set_working_directory(dir)
 }
diff --git a/core/os/os2/path_linux.odin b/core/os/os2/path_linux.odin
index 3c08eedee..be60f9b86 100644
--- a/core/os/os2/path_linux.odin
+++ b/core/os/os2/path_linux.odin
@@ -15,19 +15,13 @@ _is_path_separator :: proc(c: byte) -> bool {
 	return c == '/'
 }
 
-_mkdir :: proc(path: string, perm: File_Mode) -> Error {
-	// TODO: These modes would require mknod, however, that would also
-	//       require additional arguments to this function..
-	if perm & (File_Mode_Named_Pipe | File_Mode_Device | File_Mode_Char_Device | File_Mode_Sym_Link) != 0 {
-		return .Invalid_Argument
-	}
-
+_mkdir :: proc(path: string, perm: int) -> Error {
 	TEMP_ALLOCATOR_GUARD()
 	path_cstr := temp_cstring(path) or_return
-	return _get_platform_error(linux.mkdir(path_cstr, transmute(linux.Mode)(u32(perm) & 0o777)))
+	return _get_platform_error(linux.mkdir(path_cstr, transmute(linux.Mode)u32(perm)))
 }
 
-_mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
+_mkdir_all :: proc(path: string, perm: int) -> Error {
 	mkdirat :: proc(dfd: linux.Fd, path: []u8, perm: int, has_created: ^bool) -> Error {
 		i: int
 		for ; i < len(path) - 1 && path[i] != '/'; i += 1 {}
@@ -38,7 +32,7 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 		new_dfd, errno := linux.openat(dfd, cstring(&path[0]), _OPENDIR_FLAGS)
 		#partial switch errno {
 		case .ENOENT:
-			if errno = linux.mkdirat(dfd, cstring(&path[0]), transmute(linux.Mode)(u32(perm))); errno != .NONE {
+			if errno = linux.mkdirat(dfd, cstring(&path[0]), transmute(linux.Mode)u32(perm)); errno != .NONE {
 				return _get_platform_error(errno)
 			}
 			has_created^ = true
@@ -53,17 +47,9 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 			// skip consecutive '/'
 			for i += 1; i < len(path) && path[i] == '/'; i += 1 {}
 			return mkdirat(new_dfd, path[i:], perm, has_created)
-		case:
-			return _get_platform_error(errno)
 		}
-		unreachable()
+		return _get_platform_error(errno)
 	}
-
-	// TODO
-	if perm & (File_Mode_Named_Pipe | File_Mode_Device | File_Mode_Char_Device | File_Mode_Sym_Link) != 0 {
-		return .Invalid_Argument
-	}
-
 	TEMP_ALLOCATOR_GUARD()
 	// need something we can edit, and use to generate cstrings
 	path_bytes := make([]u8, len(path) + 1, temp_allocator())
@@ -85,12 +71,8 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 	}
 	
 	has_created: bool
-	mkdirat(dfd, path_bytes, int(perm & 0o777), &has_created) or_return
-	if has_created {
-		return nil
-	}
-	return .Exist
-	//return has_created ? nil : .Exist
+	mkdirat(dfd, path_bytes, perm, &has_created) or_return
+	return nil if has_created else .Exist
 }
 
 dirent64 :: struct {
@@ -181,7 +163,7 @@ _remove_all :: proc(path: string) -> Error {
 	return _get_platform_error(linux.rmdir(path_cstr))
 }
 
-_getwd :: proc(allocator: runtime.Allocator) -> (string, Error) {
+_get_working_directory :: proc(allocator: runtime.Allocator) -> (string, Error) {
 	// NOTE(tetra): I would use PATH_MAX here, but I was not able to find
 	// an authoritative value for it across all systems.
 	// The largest value I could find was 4096, so might as well use the page size.
@@ -201,12 +183,12 @@ _getwd :: proc(allocator: runtime.Allocator) -> (string, Error) {
 	unreachable()
 }
 
-_setwd :: proc(dir: string) -> Error {
+_set_working_directory :: proc(dir: string) -> Error {
 	dir_cstr := temp_cstring(dir) or_return
 	return _get_platform_error(linux.chdir(dir_cstr))
 }
 
-_get_full_path :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> string {
+_get_full_path :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> (fullpath: string, err: Error) {
 	PROC_FD_PATH :: "/proc/self/fd/"
 
 	buf: [32]u8
@@ -214,10 +196,9 @@ _get_full_path :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> string {
 
 	strconv.itoa(buf[len(PROC_FD_PATH):], int(fd))
 
-	fullpath: string
-	err: Error
 	if fullpath, err = _read_link_cstr(cstring(&buf[0]), allocator); err != nil || fullpath[0] != '/' {
-		return ""
+		delete(fullpath, allocator)
+		fullpath = ""
 	}
-	return fullpath
+	return
 }
diff --git a/core/os/os2/path_windows.odin b/core/os/os2/path_windows.odin
index fcd1e3321..4aa695ee2 100644
--- a/core/os/os2/path_windows.odin
+++ b/core/os/os2/path_windows.odin
@@ -12,14 +12,15 @@ _is_path_separator :: proc(c: byte) -> bool {
 	return c == '\\' || c == '/'
 }
 
-_mkdir :: proc(name: string, perm: File_Mode) -> Error {
-	if !win32.CreateDirectoryW(_fix_long_path(name), nil) {
+_mkdir :: proc(name: string, perm: int) -> Error {
+	TEMP_ALLOCATOR_GUARD()
+	if !win32.CreateDirectoryW(_fix_long_path(name, temp_allocator()) or_return, nil) {
 		return _get_platform_error()
 	}
 	return nil
 }
 
-_mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
+_mkdir_all :: proc(path: string, perm: int) -> Error {
 	fix_root_directory :: proc(p: string) -> (s: string, allocated: bool, err: runtime.Allocator_Error) {
 		if len(p) == len(`\\?\c:`) {
 			if is_path_separator(p[0]) && is_path_separator(p[1]) && p[2] == '?' && is_path_separator(p[3]) && p[5] == ':' {
@@ -33,9 +34,9 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 
 	TEMP_ALLOCATOR_GUARD()
 
-	dir, err := stat(path, temp_allocator())
+	dir_stat, err := stat(path, temp_allocator())
 	if err == nil {
-		if dir.is_directory {
+		if dir_stat.type == .Directory {
 			return nil
 		}
 		return .Exist
@@ -61,8 +62,8 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 
 	err = mkdir(path, perm)
 	if err != nil {
-		dir1, err1 := lstat(path, temp_allocator())
-		if err1 == nil && dir1.is_directory {
+		new_dir_stat, err1 := lstat(path, temp_allocator())
+		if err1 == nil && new_dir_stat.type == .Directory {
 			return nil
 		}
 		return err
@@ -71,41 +72,114 @@ _mkdir_all :: proc(path: string, perm: File_Mode) -> Error {
 }
 
 _remove_all :: proc(path: string) -> Error {
-	// TODO(bill): _remove_all for windows
+	if path == "" {
+		return nil
+	}
+
+	err := remove(path)
+	if err == nil || err == .Not_Exist {
+		return nil
+	}
+
+	TEMP_ALLOCATOR_GUARD()
+	dir := win32_utf8_to_wstring(path, temp_allocator()) or_return
+
+	empty: [1]u16
+
+	file_op := win32.SHFILEOPSTRUCTW {
+		nil,
+		win32.FO_DELETE,
+		dir,
+		&empty[0],
+		win32.FOF_NOCONFIRMATION | win32.FOF_NOERRORUI | win32.FOF_SILENT,
+		false,
+		nil,
+		&empty[0],
+	}
+	res := win32.SHFileOperationW(&file_op)
+	if res != 0 {
+		return _get_platform_error()
+	}
 	return nil
 }
 
-_getwd :: proc(allocator: runtime.Allocator) -> (dir: string, err: Error) {
-	// TODO(bill)
-	return "", nil
+@private cwd_lock: win32.SRWLOCK // zero is initialized
+
+_get_working_directory :: proc(allocator: runtime.Allocator) -> (dir: string, err: Error) {
+	win32.AcquireSRWLockExclusive(&cwd_lock)
+
+	TEMP_ALLOCATOR_GUARD()
+
+	sz_utf16 := win32.GetCurrentDirectoryW(0, nil)
+	dir_buf_wstr := make([]u16, sz_utf16, temp_allocator()) or_return
+
+	sz_utf16 = win32.GetCurrentDirectoryW(win32.DWORD(len(dir_buf_wstr)), raw_data(dir_buf_wstr))
+	assert(int(sz_utf16)+1 == len(dir_buf_wstr)) // the second time, it _excludes_ the NUL.
+
+	win32.ReleaseSRWLockExclusive(&cwd_lock)
+
+	return win32_utf16_to_utf8(dir_buf_wstr, allocator)
 }
 
-_setwd :: proc(dir: string) -> (err: Error) {
-	// TODO(bill)
-	return nil
-}
+_set_working_directory :: proc(dir: string) -> (err: Error) {
+	TEMP_ALLOCATOR_GUARD()
+	wstr := win32_utf8_to_wstring(dir, temp_allocator()) or_return
 
+	win32.AcquireSRWLockExclusive(&cwd_lock)
+
+	if !win32.SetCurrentDirectoryW(wstr) {
+		err = _get_platform_error()
+	}
+
+	win32.ReleaseSRWLockExclusive(&cwd_lock)
+
+	return
+}
 
 can_use_long_paths: bool
 
 @(init)
 init_long_path_support :: proc() {
-	// TODO(bill): init_long_path_support
-	// ADD THIS SHIT
-	// registry_path := win32.L(`Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled`)
 	can_use_long_paths = false
+
+	key: win32.HKEY
+	res := win32.RegOpenKeyExW(win32.HKEY_LOCAL_MACHINE, win32.L(`SYSTEM\CurrentControlSet\Control\FileSystem`), 0, win32.KEY_READ, &key)
+	defer win32.RegCloseKey(key)
+	if res != 0 {
+		return
+	}
+
+	value: u32
+	size := u32(size_of(value))
+	res = win32.RegGetValueW(
+		key,
+		nil,
+		win32.L("LongPathsEnabled"),
+		win32.RRF_RT_ANY,
+		nil,
+		&value,
+		&size,
+	)
+	if res != 0 {
+		return
+	}
+	if value == 1 {
+		can_use_long_paths = true
+	}
+
 }
 
-
-_fix_long_path_slice :: proc(path: string) -> []u16 {
-	return win32.utf8_to_utf16(_fix_long_path_internal(path))
+@(require_results)
+_fix_long_path_slice :: proc(path: string, allocator: runtime.Allocator) -> ([]u16, runtime.Allocator_Error) {
+	return win32_utf8_to_utf16(_fix_long_path_internal(path), allocator)
 }
 
-_fix_long_path :: proc(path: string) -> win32.wstring {
-	return win32.utf8_to_wstring(_fix_long_path_internal(path))
+@(require_results)
+_fix_long_path :: proc(path: string, allocator: runtime.Allocator) -> (win32.wstring, runtime.Allocator_Error) {
+	return win32_utf8_to_wstring(_fix_long_path_internal(path), allocator)
 }
 
-
+@(require_results)
 _fix_long_path_internal :: proc(path: string) -> string {
 	if can_use_long_paths {
 		return path
@@ -162,5 +236,4 @@ _fix_long_path_internal :: proc(path: string) -> string {
 	}
 
 	return string(path_buf[:w])
-
 }
diff --git a/core/os/os2/pipe_linux.odin b/core/os/os2/pipe_linux.odin
index 5d42cca78..c3fecfb9e 100644
--- a/core/os/os2/pipe_linux.odin
+++ b/core/os/os2/pipe_linux.odin
@@ -5,13 +5,13 @@ import "core:sys/linux"
 
 _pipe :: proc() -> (r, w: ^File, err: Error) {
 	fds: [2]linux.Fd
-	errno := linux.pipe2(&fds, {.CLOEXEC})
+	errno := linux.pipe2(&fds, {})
 	if errno != .NONE {
 		return nil, nil,_get_platform_error(errno)
 	}
 
-	r = _new_file(uintptr(fds[0]))
-	w = _new_file(uintptr(fds[1]))
+	r = _new_file(uintptr(fds[0])) or_return
+	w = _new_file(uintptr(fds[1])) or_return
 	return
 }
 
diff --git a/core/os/os2/pipe_windows.odin b/core/os/os2/pipe_windows.odin
index bab8b44f5..59615e306 100644
--- a/core/os/os2/pipe_windows.odin
+++ b/core/os/os2/pipe_windows.odin
@@ -5,7 +5,11 @@ import win32 "core:sys/windows"
 
 _pipe :: proc() -> (r, w: ^File, err: Error) {
 	p: [2]win32.HANDLE
-	if !win32.CreatePipe(&p[0], &p[1], nil, 0) {
+	sa := win32.SECURITY_ATTRIBUTES {
+		nLength = size_of(win32.SECURITY_ATTRIBUTES),
+		bInheritHandle = true,
+	}
+	if !win32.CreatePipe(&p[0], &p[1], &sa, 0) {
 		return nil, nil, _get_platform_error()
 	}
 	return new_file(uintptr(p[0]), ""), new_file(uintptr(p[1]), ""), nil
diff --git a/core/os/os2/process.odin b/core/os/os2/process.odin
index 862434b7b..3f3e64668 100644
--- a/core/os/os2/process.odin
+++ b/core/os/os2/process.odin
@@ -1,102 +1,406 @@
 package os2
 
-import "core:sync"
-import "core:time"
 import "base:runtime"
+import "core:time"
 
-args: []string
+/*
+	In procedures that explicitly state this as one of the allowed values,
+	specifies an infinite timeout.
+*/
+TIMEOUT_INFINITE :: time.MIN_DURATION // Note(flysand): Any negative duration will be treated as infinity
 
+/*
+	Arguments to the current process.
+*/
+args := get_args()
+
+@(private="file", require_results)
+get_args :: proc() -> []string {
+	result := make([]string, len(runtime.args__), heap_allocator())
+	for rt_arg, i in runtime.args__ {
+		result[i] = string(rt_arg)
+	}
+	return result
+}
+
+/*
+	Exit the current process.
+*/
 exit :: proc "contextless" (code: int) -> ! {
-	runtime.trap()
+	_exit(code)
 }
 
+/*
+	Obtain the UID of the current process.
+
+	**Note(windows)**: Windows doesn't follow the posix permissions model, so
+	the function simply returns -1.
+*/
+@(require_results)
 get_uid :: proc() -> int {
-	return -1
+	return _get_uid()
 }
 
+/*
+	Obtain the effective UID of the current process.
+
+	The effective UID is typically the same as the UID of the process. In case
+	the process was run by a user with elevated permissions, the process may
+	lower the privilege to perform some tasks without privilege. In these cases
+	the real UID of the process and the effective UID are different.
+	
+	**Note(windows)**: Windows doesn't follow the posix permissions model, so
+	the function simply returns -1.
+*/
+@(require_results)
 get_euid :: proc() -> int {
-	return -1
+	return _get_euid()
 }
 
+/*
+	Obtain the GID of the current process.
+	
+	**Note(windows)**: Windows doesn't follow the posix permissions model, so
+	the function simply returns -1.
+*/
+@(require_results)
 get_gid :: proc() -> int {
-	return -1
+	return _get_gid()
 }
 
+/*
+	Obtain the effective GID of the current process.
+	
+	The effective GID is typically the same as the GID of the process. In case
+	the process was run by a user with elevated permissions, the process may
+	lower the privilege to perform some tasks without privilege. In these cases
+	the real GID of the process and the effective GID are different.
+
+	**Note(windows)**: Windows doesn't follow the posix permissions model, so
+	the function simply returns -1.
+*/
+@(require_results)
 get_egid :: proc() -> int {
-	return -1
+	return _get_egid()
 }
 
+/*
+	Obtain the ID of the current process.
+*/
+@(require_results)
 get_pid :: proc() -> int {
-	return -1
+	return _get_pid()
 }
 
+/*
+	Obtain the ID of the parent process.
+
+	**Note(windows)**: Windows does not mantain strong relationships between
+	parent and child processes. This function returns the ID of the process
+	that has created the current process. In case the parent has died, the ID
+	returned by this function can identify a non-existent or a different
+	process.
+*/
+@(require_results)
 get_ppid :: proc() -> int {
-	return -1
+	return _get_ppid()
 }
 
+/*
+	Obtain ID's of all processes running in the system.
+*/
+@(require_results)
+process_list :: proc(allocator: runtime.Allocator) -> ([]int, Error) {
+	return _process_list(allocator)
+}
 
+/*
+	Bit set specifying which fields of the `Process_Info` struct need to be
+	obtained by the `process_info()` procedure. Each bit corresponds to a
+	field in the `Process_Info` struct.
+*/
+Process_Info_Fields :: bit_set[Process_Info_Field]
+Process_Info_Field :: enum {
+	Executable_Path,
+	PPid,
+	Priority,
+	Command_Line,
+	Command_Args,
+	Environment,
+	Username,
+	Working_Dir,
+}
+
+/*
+	Contains information about the process as obtained by the `process_info()`
+	procedure.
+*/
+Process_Info :: struct {
+	// The information about a process the struct contains. `pid` is always
+	// stored, no matter what.
+	fields: Process_Info_Fields,
+	// The ID of the process.
+	pid: int,
+	// The ID of the parent process.
+	ppid: int,
+	// The process priority.
+	priority: int,
+	// The path to the executable, which the process runs.
+	executable_path: string,
+	// The command line supplied to the process.
+	command_line: string,
+	// The arguments supplied to the process.
+	command_args: []string,
+	// The environment of the process.
+	environment: []string,
+	// The username of the user who started the process.
+	username: string,
+	// The current working directory of the process.
+	working_dir: string,
+}
+
+/*
+	Obtain information about a process.
+
+	This procedure obtains an information, specified by `selection` parameter of
+	a process given by `pid`.
+	
+	Use `free_process_info` to free the memory allocated by this procedure. In
+	case the function returns an error all temporary allocations would be freed
+	and as such, calling `free_process_info()` is not needed.
+
+	**Note**: The resulting information may or may contain the fields specified
+	by the `selection` parameter. Always check whether the returned
+	`Process_Info` struct has the required fields before checking the error code
+	returned by this function.
+*/
+@(require_results)
+process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (Process_Info, Error) {
+	return _process_info_by_pid(pid, selection, allocator)
+}
+
+/*
+	Obtain information about a process.
+
+	This procedure obtains information, specified by `selection` parameter
+	about a process that has been opened by the application, specified in
+	the `process` parameter.
+
+	Use `free_process_info` to free the memory allocated by this procedure. In
+	case the function returns an error, all temporary allocations would be freed
+	and as such, calling `free_process_info` is not needed.
+
+	**Note**: The resulting information may or may contain the fields specified
+	by the `selection` parameter. Always check whether the returned
+	`Process_Info` struct has the required fields before checking the error code
+	returned by this function.
+*/
+@(require_results)
+process_info_by_handle :: proc(process: Process, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (Process_Info, Error) {
+	return _process_info_by_handle(process, selection, allocator)
+}
+
+/*
+	Obtain information about the current process.
+
+	This procedure obtains the information, specified by `selection` parameter
+	about the currently running process.
+
+	Use `free_process_info` to free the memory allocated by this function. In
+	case this function returns an error, all temporary allocations would be
+	freed and as such calling `free_process_info()` is not needed.
+
+	**Note**: The resulting information may or may contain the fields specified
+	by the `selection` parameter. Always check whether the returned
+	`Process_Info` struct has the required fields before checking the error code
+	returned by this function.
+*/
+@(require_results)
+current_process_info :: proc(selection: Process_Info_Fields, allocator: runtime.Allocator) -> (Process_Info, Error) {
+	return _current_process_info(selection, allocator)
+}
+
+/*
+	Obtain information about the specified process.
+*/
+process_info :: proc {
+	process_info_by_pid,
+	process_info_by_handle,
+	current_process_info,
+}
+
+/*
+	Free the information about the process.
+
+	This procedure frees the memory occupied by process info using the provided
+	allocator. The allocator needs to be the same allocator that was supplied
+	to the `process_info` function.
+*/
+free_process_info :: proc(pi: Process_Info, allocator: runtime.Allocator) {
+	delete(pi.executable_path, allocator)
+	delete(pi.command_line, allocator)
+	delete(pi.command_args, allocator)
+	for s in pi.environment {
+		delete(s, allocator)
+	}
+	delete(pi.environment, allocator)
+	delete(pi.working_dir, allocator)
+}
+
+/*
+	Represents a process handle.
+
+	When a process dies, the OS is free to re-use the pid of that process. The
+	`Process` struct represents a handle to the process that will refer to a
+	specific process, even after it has died.
+
+	**Note(linux)**: The `handle` will be referring to pidfd.
+*/
 Process :: struct {
-	pid:          int,
-	handle:       uintptr,
-	is_done:      b32,
-	signal_mutex: sync.RW_Mutex,
+	pid: int,
+	handle: uintptr,
 }
 
+Process_Open_Flags :: bit_set[Process_Open_Flag]
+Process_Open_Flag :: enum {
+	// Request for reading from the virtual memory of another process.
+	Mem_Read,
+	// Request for writing to the virtual memory of another process.
+	Mem_Write,
+}
 
-Process_Attributes :: struct {
-	dir: string,
+/*
+	Open a process handle using it's pid.
+
+	This procedure obtains a process handle of a process specified by `pid`.
+	This procedure can be subject to race conditions. See the description of
+	`Process`.
+
+	Use `process_close()` function to close the process handle.
+*/
+@(require_results)
+process_open :: proc(pid: int, flags := Process_Open_Flags {}) -> (Process, Error) {
+	return _process_open(pid, flags)
+}
+
+/*
+	The description of how a process should be created.
+*/
+Process_Desc :: struct {
+	// OS-specific attributes.
+	sys_attr: _Sys_Process_Attributes,
+	// The working directory of the process. If the string has length 0, the
+	// working directory is assumed to be the current working directory of the
+	// current process.
+	working_dir: string,
+	// The command to run. Each element of the slice is a separate argument to
+	// the process. The first element of the slice would be the executable.
+	command: []string,
+	// A slice of strings, each having the format `KEY=VALUE` representing the
+	// full environment that the child process will receive.
+	// In case this slice is `nil`, the current process' environment is used.
 	env: []string,
-	files: []^File,
-	sys: ^Process_Attributes_OS_Specific,
+	// The `stderr` handle to give to the child process. It can be either a file
+	// or a writeable end of a pipe. Passing `nil` will shut down the process'
+	// stderr output.
+	stderr: ^File,
+	// The `stdout` handle to give to the child process. It can be either a file
+	// or a writeabe end of a pipe. Passing a `nil` will shut down the process'
+	// stdout output.
+	stdout: ^File,
+	// The `stdin` handle to give to the child process. It can either be a file
+	// or a readable end of a pipe. Passing a `nil` will shut down the process'
+	// input.
+	stdin: ^File,
 }
 
-Process_Attributes_OS_Specific :: struct{}
+/*
+	Create a new process and obtain its handle.
 
-Process_Error :: enum {
-	None,
+	This procedure creates a new process, with a given command and environment
+	strings as parameters. Use `environ()` to inherit the environment of the
+	current process.
+
+	The `desc` parameter specifies the description of how the process should
+	be created. It contains information such as the command line, the
+	environment of the process, the starting directory and many other options.
+	Most of the fields in the struct can be set to `nil` or an empty value.
+	
+	Use `process_close` to close the handle to the process. Note, that this
+	is not the same as terminating the process. One can terminate the process
+	and not close the handle, in which case the handle would be leaked. In case
+	the function returns an error, an invalid handle is returned.
+
+	This procedure is not thread-safe. It may alter the inheritance properties
+	of file handles in an unpredictable manner. In case multiple threads change
+	handle inheritance properties, make sure to serialize all those calls.
+*/
+@(require_results)
+process_start :: proc(desc := Process_Desc {}) -> (Process, Error) {
+	return _process_start(desc)
 }
 
+/*
+	The state of the process after it has finished execution.
+*/
 Process_State :: struct {
-	pid:         int,
-	exit_code:   int,
-	exited:      bool,
-	success:     bool,
+	// The ID of the process.
+	pid: int,
+	// Specifies whether the process has terminated or is still running.
+	exited: bool,
+	// The exit code of the process, if it has exited.
+	// Will also store the number of the exception or signal that has crashed the
+	// process.
+	exit_code: int,
+	// Specifies whether the termination of the process was successfull or not,
+	// i.e. whether it has crashed or not.
+	// **Note(windows)**: On windows `true` is always returned, as there is no
+	// reliable way to obtain information about whether the process has crashed.
+	success: bool,
+	// The time the process has spend executing in kernel time.
 	system_time: time.Duration,
-	user_time:   time.Duration,
-	sys:         rawptr,
+	// The time the process has spend executing in userspace.
+	user_time: time.Duration,
 }
 
-Signal :: #type proc()
+/*
+	Wait for a process event.
 
-Kill:      Signal = nil
-Interrupt: Signal = nil
+	This procedure blocks the execution until the process has exited or the
+	timeout (if specified) has reached zero. If the timeout is `TIMEOUT_INFINITE`,
+	no timeout restriction is imposed and the procedure can block indefinately.
 
+	If the timeout has expired, the `General_Error.Timeout` is returned as
+	the error.
 
-find_process :: proc(pid: int) -> (^Process, Process_Error) {
-	return nil, .None
+	If an error is returned for any other reason, other than timeout, the
+	process state is considered undetermined.
+*/
+@(require_results)
+process_wait :: proc(process: Process, timeout := TIMEOUT_INFINITE) -> (Process_State, Error) {
+	return _process_wait(process, timeout)
 }
 
+/*
+	Close the handle to a process.
 
-process_start :: proc(name: string, argv: []string, attr: ^Process_Attributes) -> (^Process, Process_Error) {
-	return nil, .None
+	This procedure closes the handle associated with a process. It **does not**
+	terminate a process, in case it was running. In case a termination is
+	desired, kill the process first, wait for the process to finish,
+	then close the handle.
+*/
+@(require_results)
+process_close :: proc(process: Process) -> (Error) {
+	return _process_close(process)
 }
 
-process_release :: proc(p: ^Process) -> Process_Error {
-	return .None
+/*
+	Terminate a process.
+
+	This procedure terminates a process, specified by it's handle, `process`.
+
+*/
+@(require_results)
+process_kill :: proc(process: Process) -> (Error) {
+	return _process_kill(process)
 }
-
-process_kill :: proc(p: ^Process) -> Process_Error {
-	return .None
-}
-
-process_signal :: proc(p: ^Process, sig: Signal) -> Process_Error {
-	return .None
-}
-
-process_wait :: proc(p: ^Process) -> (Process_State, Process_Error) {
-	return {}, .None
-}
-
-
-
-
diff --git a/core/os/os2/process_linux.odin b/core/os/os2/process_linux.odin
new file mode 100644
index 000000000..d832083b6
--- /dev/null
+++ b/core/os/os2/process_linux.odin
@@ -0,0 +1,95 @@
+//+private file
+package os2
+
+import "base:runtime"
+import "core:time"
+import "core:sys/linux"
+
+@(private="package")
+_exit :: proc "contextless" (code: int) -> ! {
+	linux.exit(i32(code))
+}
+
+
+@(private="package")
+_get_uid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_euid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_gid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_egid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_pid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_ppid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_process_list :: proc(allocator: runtime.Allocator) -> (list: []int, err: Error) {
+	return
+}
+
+@(private="package")
+_process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	return
+}
+
+@(private="package")
+_process_info_by_handle :: proc(process: Process, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	return
+}
+
+@(private="package")
+_current_process_info :: proc(selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	return
+}
+
+@(private="package")
+_process_open :: proc(pid: int, flags: Process_Open_Flags) -> (process: Process, err: Error) {
+	return
+}
+
+@(private="package")
+_Sys_Process_Attributes :: struct {}
+
+@(private="package")
+_process_start :: proc(desc: Process_Desc) -> (process: Process, err: Error) {
+	return
+}
+
+@(private="package")
+_process_wait :: proc(process: Process, timeout: time.Duration) -> (process_state: Process_State, err: Error) {
+	return
+}
+
+@(private="package")
+_process_close :: proc(process: Process) -> Error {
+	return nil
+}
+
+@(private="package")
+_process_kill :: proc(process: Process) -> Error {
+	return nil
+}
+
+@(private="package")
+_process_exe_by_pid :: proc(pid: int, allocator: runtime.Allocator) -> (exe_path: string, err: Error) {
+	return
+}
\ No newline at end of file
diff --git a/core/os/os2/process_windows.odin b/core/os/os2/process_windows.odin
new file mode 100644
index 000000000..47fd62401
--- /dev/null
+++ b/core/os/os2/process_windows.odin
@@ -0,0 +1,695 @@
+//+private file
+package os2
+
+import "base:runtime"
+
+import "core:strings"
+import win32 "core:sys/windows"
+import "core:time"
+
+@(private="package")
+_exit :: proc "contextless" (code: int) -> ! {
+	win32.ExitProcess(u32(code))
+}
+
+@(private="package")
+_get_uid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_euid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_gid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_egid :: proc() -> int {
+	return -1
+}
+
+@(private="package")
+_get_pid :: proc() -> int {
+	return int(win32.GetCurrentProcessId())
+}
+
+@(private="package")
+_get_ppid :: proc() -> int {
+	our_pid := win32.GetCurrentProcessId()
+	snap := win32.CreateToolhelp32Snapshot(win32.TH32CS_SNAPPROCESS, 0)
+	if snap == win32.INVALID_HANDLE_VALUE {
+		return -1
+	}
+	defer win32.CloseHandle(snap)
+	entry := win32.PROCESSENTRY32W { dwSize = size_of(win32.PROCESSENTRY32W) }
+	for status := win32.Process32FirstW(snap, &entry); status; /**/ {
+		if entry.th32ProcessID == our_pid {
+			return int(entry.th32ParentProcessID)
+		}
+		status = win32.Process32NextW(snap, &entry)
+	}
+	return -1
+}
+
+@(private="package")
+_process_list :: proc(allocator: runtime.Allocator) -> (list: []int, err: Error) {
+	snap := win32.CreateToolhelp32Snapshot(win32.TH32CS_SNAPPROCESS, 0)
+	if snap == win32.INVALID_HANDLE_VALUE {
+		err = _get_platform_error()
+		return
+	}
+
+	list_d := make([dynamic]int, allocator) or_return
+
+	entry := win32.PROCESSENTRY32W{dwSize = size_of(win32.PROCESSENTRY32W)}
+	status := win32.Process32FirstW(snap, &entry)
+	for status {
+		append(&list_d, int(entry.th32ProcessID))
+		status = win32.Process32NextW(snap, &entry)
+	}
+	list = list_d[:]
+	return
+}
+
+@(require_results)
+read_memory_as_struct :: proc(h: win32.HANDLE, addr: rawptr, dest: ^$T) -> (bytes_read: uint, err: Error) {
+	if !win32.ReadProcessMemory(h, addr, dest, size_of(T), &bytes_read) {
+		err = _get_platform_error()
+	}
+	return
+}
+@(require_results)
+read_memory_as_slice :: proc(h: win32.HANDLE, addr: rawptr, dest: []$T) -> (bytes_read: uint, err: Error) {
+	if !win32.ReadProcessMemory(h, addr, raw_data(dest), len(dest)*size_of(T), &bytes_read) {
+		err = _get_platform_error()
+	}
+	return
+}
+
+@(private="package")
+_process_info_by_pid :: proc(pid: int, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	info.pid = pid
+	defer if err != nil {
+		free_process_info(info, allocator)
+	}
+
+	// Data obtained from process snapshots
+	if selection >= {.PPid, .Priority} {
+		entry, entry_err := _process_entry_by_pid(info.pid)
+		if entry_err != nil {
+			err = General_Error.Not_Exist
+			return
+		}
+		if .PPid in selection {
+			info.fields += {.PPid}
+			info.ppid = int(entry.th32ParentProcessID)
+		}
+		if .Priority in selection {
+			info.fields += {.Priority}
+			info.priority = int(entry.pcPriClassBase)
+		}
+	}
+	if .Executable_Path in selection { // snap module
+		info.executable_path = _process_exe_by_pid(pid, allocator) or_return
+		info.fields += {.Executable_Path}
+	}
+
+	ph := win32.INVALID_HANDLE_VALUE
+
+	if selection >= {.Command_Line, .Environment, .Working_Dir, .Username} { // need process handle
+		ph = win32.OpenProcess(
+			win32.PROCESS_QUERY_LIMITED_INFORMATION | win32.PROCESS_VM_READ,
+			false,
+			u32(pid),
+		)
+		if ph == win32.INVALID_HANDLE_VALUE {
+			err = _get_platform_error()
+			return
+		}
+	}
+	defer if ph != win32.INVALID_HANDLE_VALUE {
+		win32.CloseHandle(ph)
+	}
+
+	if selection >= {.Command_Line, .Environment, .Working_Dir} { // need peb
+		process_info_size: u32
+		process_info: win32.PROCESS_BASIC_INFORMATION
+		status := win32.NtQueryInformationProcess(ph, .ProcessBasicInformation, &process_info, size_of(process_info), &process_info_size)
+		if status != 0 {
+			// TODO(flysand): There's probably a mismatch between NTSTATUS and
+			// windows userland error codes, I haven't checked.
+			err = Platform_Error(status)
+			return
+		}
+		if process_info.PebBaseAddress == nil {
+			// Not sure what the error is
+			err = General_Error.Unsupported
+			return
+		}
+		process_peb: win32.PEB
+
+		_ = read_memory_as_struct(ph, process_info.PebBaseAddress, &process_peb) or_return
+
+		process_params: win32.RTL_USER_PROCESS_PARAMETERS
+		_ = read_memory_as_struct(ph, process_peb.ProcessParameters, &process_params) or_return
+
+		if selection >= {.Command_Line, .Command_Args} {
+			TEMP_ALLOCATOR_GUARD()
+			cmdline_w := make([]u16, process_params.CommandLine.Length, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.CommandLine.Buffer, cmdline_w) or_return
+
+			if .Command_Line in selection {
+				info.command_line = win32_utf16_to_utf8(cmdline_w, allocator) or_return
+				info.fields += {.Command_Line}
+			}
+			if .Command_Args in selection {
+				info.command_args = _parse_command_line(raw_data(cmdline_w), allocator) or_return
+				info.fields += {.Command_Args}
+			}
+		}
+		if .Environment in selection {
+			TEMP_ALLOCATOR_GUARD()
+			env_len := process_params.EnvironmentSize / 2
+			envs_w := make([]u16, env_len, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.Environment, envs_w) or_return
+
+			info.environment = _parse_environment_block(raw_data(envs_w), allocator) or_return
+			info.fields += {.Environment}
+		}
+		if .Working_Dir in selection {
+			TEMP_ALLOCATOR_GUARD()
+			cwd_w := make([]u16, process_params.CurrentDirectoryPath.Length, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.CurrentDirectoryPath.Buffer, cwd_w) or_return
+
+			info.working_dir = win32_utf16_to_utf8(cwd_w, allocator) or_return
+			info.fields += {.Working_Dir}
+		}
+	}
+
+	if .Username in selection {
+		info.username = _get_process_user(ph, allocator) or_return
+		info.fields += {.Username}
+	}
+	err = nil
+	return
+}
+
+@(private="package")
+_process_info_by_handle :: proc(process: Process, selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	pid := process.pid
+	info.pid = pid
+	defer if err != nil {
+		free_process_info(info, allocator)
+	}
+
+	// Data obtained from process snapshots
+	if selection >= {.PPid, .Priority} { // snap process
+		entry, entry_err := _process_entry_by_pid(info.pid)
+		if entry_err != nil {
+			err = General_Error.Not_Exist
+			return
+		}
+		if .PPid in selection {
+			info.fields += {.PPid}
+			info.ppid = int(entry.th32ParentProcessID)
+		}
+		if .Priority in selection {
+			info.fields += {.Priority}
+			info.priority = int(entry.pcPriClassBase)
+		}
+	}
+	if .Executable_Path in selection { // snap module
+		info.executable_path = _process_exe_by_pid(pid, allocator) or_return
+		info.fields += {.Executable_Path}
+	}
+	ph := win32.HANDLE(process.handle)
+	if selection >= {.Command_Line, .Environment, .Working_Dir} { // need peb
+		process_info_size: u32
+		process_info: win32.PROCESS_BASIC_INFORMATION
+		status := win32.NtQueryInformationProcess(ph, .ProcessBasicInformation, &process_info, size_of(process_info), &process_info_size)
+		if status != 0 {
+			// TODO(flysand): There's probably a mismatch between NTSTATUS and
+			// windows userland error codes, I haven't checked.
+			err = Platform_Error(status)
+			return
+		}
+		if process_info.PebBaseAddress == nil {
+			// Not sure what the error is
+			err = General_Error.Unsupported
+			return
+		}
+
+		process_peb: win32.PEB
+		_ = read_memory_as_struct(ph, process_info.PebBaseAddress, &process_peb) or_return
+
+		process_params: win32.RTL_USER_PROCESS_PARAMETERS
+		_ = read_memory_as_struct(ph, process_peb.ProcessParameters, &process_params) or_return
+
+		if selection >= {.Command_Line, .Command_Args} {
+			TEMP_ALLOCATOR_GUARD()
+			cmdline_w := make([]u16, process_params.CommandLine.Length, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.CommandLine.Buffer, cmdline_w) or_return
+
+			if .Command_Line in selection {
+				info.command_line = win32_utf16_to_utf8(cmdline_w, allocator) or_return
+				info.fields += {.Command_Line}
+			}
+			if .Command_Args in selection {
+				info.command_args = _parse_command_line(raw_data(cmdline_w), allocator) or_return
+				info.fields += {.Command_Args}
+			}
+		}
+
+		if .Environment in selection {
+			TEMP_ALLOCATOR_GUARD()
+			env_len := process_params.EnvironmentSize / 2
+			envs_w := make([]u16, env_len, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.Environment, envs_w) or_return
+
+			info.environment =  _parse_environment_block(raw_data(envs_w), allocator) or_return
+			info.fields += {.Environment}
+		}
+
+		if .Working_Dir in selection {
+			TEMP_ALLOCATOR_GUARD()
+			cwd_w := make([]u16, process_params.CurrentDirectoryPath.Length, temp_allocator()) or_return
+			_ = read_memory_as_slice(ph, process_params.CurrentDirectoryPath.Buffer, cwd_w) or_return
+
+			info.working_dir = win32_utf16_to_utf8(cwd_w, allocator) or_return
+			info.fields += {.Working_Dir}
+		}
+	}
+	if .Username in selection {
+		info.username = _get_process_user(ph, allocator) or_return
+		info.fields += {.Username}
+	}
+	err = nil
+	return
+}
+
+@(private="package")
+_current_process_info :: proc(selection: Process_Info_Fields, allocator: runtime.Allocator) -> (info: Process_Info, err: Error) {
+	info.pid = get_pid()
+	defer if err != nil {
+		free_process_info(info, allocator)
+	}
+
+	if selection >= {.PPid, .Priority} { // snap process
+		entry, entry_err := _process_entry_by_pid(info.pid)
+		if entry_err != nil {
+			err = General_Error.Not_Exist
+			return
+		}
+		if .PPid in selection {
+			info.fields += {.PPid}
+			info.ppid = int(entry.th32ProcessID)
+		}
+		if .Priority in selection {
+			info.fields += {.Priority}
+			info.priority = int(entry.pcPriClassBase)
+		}
+	}
+	if .Executable_Path in selection {
+		exe_filename_w: [256]u16
+		path_len := win32.GetModuleFileNameW(nil, raw_data(exe_filename_w[:]), len(exe_filename_w))
+		info.executable_path = win32_utf16_to_utf8(exe_filename_w[:path_len], allocator) or_return
+		info.fields += {.Executable_Path}
+	}
+	if selection >= {.Command_Line,  .Command_Args} {
+		command_line_w := win32.GetCommandLineW()
+		if .Command_Line in selection {
+			info.command_line = win32_wstring_to_utf8(command_line_w, allocator) or_return
+			info.fields += {.Command_Line}
+		}
+		if .Command_Args in selection {
+			info.command_args = _parse_command_line(command_line_w, allocator) or_return
+			info.fields += {.Command_Args}
+		}
+	}
+	if .Environment in selection {
+		env_block := win32.GetEnvironmentStringsW()
+		info.environment = _parse_environment_block(env_block, allocator) or_return
+		info.fields += {.Environment}
+	}
+	if .Username in selection {
+		process_handle := win32.GetCurrentProcess()
+		info.username = _get_process_user(process_handle, allocator) or_return
+		info.fields += {.Username}
+	}
+	if .Working_Dir in selection {
+		// TODO(flysand): Implement this by reading PEB
+		err = .Mode_Not_Implemented
+		return
+	}
+	err = nil
+	return
+}
+
+@(private="package")
+_process_open :: proc(pid: int, flags: Process_Open_Flags) -> (process: Process, err: Error) {
+	// Note(flysand): The handle will be used for querying information so we
+	// take the necessary permissions right away.
+	dwDesiredAccess := win32.PROCESS_QUERY_LIMITED_INFORMATION | win32.SYNCHRONIZE
+	if .Mem_Read in flags {
+		dwDesiredAccess |= win32.PROCESS_VM_READ
+	}
+	if .Mem_Write in flags {
+		dwDesiredAccess |= win32.PROCESS_VM_WRITE
+	}
+	handle := win32.OpenProcess(
+		dwDesiredAccess,
+		false,
+		u32(pid),
+	)
+	if handle == win32.INVALID_HANDLE_VALUE {
+		err = _get_platform_error()
+	} else {
+		process = {pid = pid, handle = uintptr(handle)}
+	}
+	return
+}
+
+@(private="package")
+_Sys_Process_Attributes :: struct {}
+
+@(private="package")
+_process_start :: proc(desc: Process_Desc) -> (process: Process, err: Error) {
+	TEMP_ALLOCATOR_GUARD()
+	command_line   := _build_command_line(desc.command, temp_allocator())
+	command_line_w := win32_utf8_to_wstring(command_line, temp_allocator()) or_return
+	environment := desc.env
+	if desc.env == nil {
+		environment = environ(temp_allocator())
+	}
+	environment_block   := _build_environment_block(environment, temp_allocator())
+	environment_block_w := win32_utf8_to_utf16(environment_block, temp_allocator()) or_return
+	stderr_handle       := win32.GetStdHandle(win32.STD_ERROR_HANDLE)
+	stdout_handle       := win32.GetStdHandle(win32.STD_OUTPUT_HANDLE)
+	stdin_handle        := win32.GetStdHandle(win32.STD_INPUT_HANDLE)
+
+	if desc.stdout != nil {
+		stdout_handle = win32.HANDLE((^File_Impl)(desc.stdout.impl).fd)
+	}
+	if desc.stderr != nil {
+		stderr_handle = win32.HANDLE((^File_Impl)(desc.stderr.impl).fd)
+	}
+	if desc.stdin != nil {
+		stdin_handle = win32.HANDLE((^File_Impl)(desc.stderr.impl).fd)
+	}
+
+	working_dir_w := (win32_utf8_to_wstring(desc.working_dir, temp_allocator()) or_else nil) if len(desc.working_dir) > 0 else nil
+	process_info: win32.PROCESS_INFORMATION
+	ok := win32.CreateProcessW(
+		nil,
+		command_line_w,
+		nil,
+		nil,
+		true,
+		win32.CREATE_UNICODE_ENVIRONMENT|win32.NORMAL_PRIORITY_CLASS,
+		raw_data(environment_block_w),
+		working_dir_w,
+		&win32.STARTUPINFOW{
+			cb = size_of(win32.STARTUPINFOW),
+			hStdError  = stderr_handle,
+			hStdOutput = stdout_handle,
+			hStdInput  = stdin_handle,
+			dwFlags = win32.STARTF_USESTDHANDLES,
+		},
+		&process_info,
+	)
+	if !ok {
+		err = _get_platform_error()
+		return
+	}
+	process = {pid = int(process_info.dwProcessId), handle = uintptr(process_info.hProcess)}
+	return
+}
+
+@(private="package")
+_process_wait :: proc(process: Process, timeout: time.Duration) -> (process_state: Process_State, err: Error) {
+	handle := win32.HANDLE(process.handle)
+	timeout_ms := u32(timeout / time.Millisecond) if timeout >= 0 else win32.INFINITE
+
+	switch win32.WaitForSingleObject(handle, timeout_ms) {
+	case win32.WAIT_OBJECT_0:
+		exit_code: u32
+		if !win32.GetExitCodeProcess(handle, &exit_code) {
+			err =_get_platform_error()
+			return
+		}
+		time_created: win32.FILETIME
+		time_exited: win32.FILETIME
+		time_kernel: win32.FILETIME
+		time_user: win32.FILETIME
+		if !win32.GetProcessTimes(handle, &time_created, &time_exited, &time_kernel, &time_user) {
+			err = _get_platform_error()
+			return
+		}
+		process_state = {
+			exit_code   = int(exit_code),
+			exited      = true,
+			pid         = process.pid,
+			success     = true,
+			system_time = _filetime_to_duration(time_kernel),
+			user_time   = _filetime_to_duration(time_user),
+		}
+		return
+	case win32.WAIT_TIMEOUT:
+		err = General_Error.Timeout
+		return
+	case:
+		err = _get_platform_error()
+		return
+	}
+}
+
+@(private="package")
+_process_close :: proc(process: Process) -> Error {
+	if !win32.CloseHandle(win32.HANDLE(process.handle)) {
+		return _get_platform_error()
+	}
+	return nil
+}
+
+@(private="package")
+_process_kill :: proc(process: Process) -> Error {
+	// Note(flysand): This is different than what the task manager's "kill process"
+	// functionality does, as we don't try to send WM_CLOSE message first. This
+	// is quite a rough way to kill the process, which should be consistent with
+	// linux. The error code 9 is to mimic SIGKILL event.
+	if !win32.TerminateProcess(win32.HANDLE(process.handle), 9) {
+		return _get_platform_error()
+	}
+	return nil
+}
+
+_filetime_to_duration :: proc(filetime: win32.FILETIME) -> time.Duration {
+	ticks := u64(filetime.dwHighDateTime)<<32 | u64(filetime.dwLowDateTime)
+	return time.Duration(ticks * 100)
+}
+
+_process_entry_by_pid :: proc(pid: int) -> (entry: win32.PROCESSENTRY32W, err: Error) {
+	snap := win32.CreateToolhelp32Snapshot(win32.TH32CS_SNAPPROCESS, 0)
+	if snap == win32.INVALID_HANDLE_VALUE {
+		err = _get_platform_error()
+		return
+	}
+	defer win32.CloseHandle(snap)
+
+	entry = win32.PROCESSENTRY32W{dwSize = size_of(win32.PROCESSENTRY32W)}
+	status := win32.Process32FirstW(snap, &entry)
+	for status {
+		if u32(pid) == entry.th32ProcessID {
+			return
+		}
+		status = win32.Process32NextW(snap, &entry)
+	}
+	err = General_Error.Not_Exist
+	return
+}
+
+// Note(flysand): Not sure which way it's better to get the executable path:
+// via toolhelp snapshots or by reading other process' PEB memory. I have
+// a slight suspicion that if both exe path and command line are desired,
+// it's faster to just read both from PEB, but maybe the toolhelp snapshots
+// are just better...?
+@(private="package")
+_process_exe_by_pid :: proc(pid: int, allocator: runtime.Allocator) -> (exe_path: string, err: Error) {
+	snap := win32.CreateToolhelp32Snapshot(
+		win32.TH32CS_SNAPMODULE|win32.TH32CS_SNAPMODULE32,
+		u32(pid),
+	)
+	if snap == win32.INVALID_HANDLE_VALUE {
+		err =_get_platform_error()
+		return
+	}
+	defer win32.CloseHandle(snap)
+
+	entry := win32.MODULEENTRY32W { dwSize = size_of(win32.MODULEENTRY32W) }
+	status := win32.Module32FirstW(snap, &entry)
+	if !status {
+		err =_get_platform_error()
+		return
+	}
+	return win32_wstring_to_utf8(raw_data(entry.szExePath[:]), allocator)
+}
+
+_get_process_user :: proc(process_handle: win32.HANDLE, allocator: runtime.Allocator) -> (full_username: string, err: Error) {
+	TEMP_ALLOCATOR_GUARD()
+	token_handle: win32.HANDLE
+	if !win32.OpenProcessToken(process_handle, win32.TOKEN_QUERY, &token_handle) {
+		err = _get_platform_error()
+		return
+	}
+	token_user_size: u32
+	if !win32.GetTokenInformation(token_handle, .TokenUser, nil, 0, &token_user_size) {
+		// Note(flysand): Make sure the buffer too small error comes out, and not any other error
+		err = _get_platform_error()
+		if v, ok := is_platform_error(err); !ok || v != i32(win32.ERROR_INSUFFICIENT_BUFFER) {
+			return
+		}
+		err = nil
+	}
+	token_user := (^win32.TOKEN_USER)(raw_data(make([]u8, token_user_size, temp_allocator()) or_return))
+	if !win32.GetTokenInformation(token_handle, .TokenUser, token_user, token_user_size, &token_user_size) {
+		err = _get_platform_error()
+		return
+	}
+
+	sid_type: win32.SID_NAME_USE
+	username_w: [256]u16
+	domain_w:   [256]u16
+	username_chrs := u32(256)
+	domain_chrs   := u32(256)
+
+	if !win32.LookupAccountSidW(nil, token_user.User.Sid, &username_w[0], &username_chrs, &domain_w[0], &domain_chrs, &sid_type) {
+		err = _get_platform_error()
+		return
+	}
+	username := win32_utf16_to_utf8(username_w[:username_chrs], temp_allocator()) or_return
+	domain   := win32_utf16_to_utf8(domain_w[:domain_chrs], temp_allocator()) or_return
+	return strings.concatenate({domain, "\\", username}, allocator)
+}
+
+_parse_command_line :: proc(cmd_line_w: [^]u16, allocator: runtime.Allocator) -> (argv: []string, err: Error) {
+	argc: i32
+	argv_w := win32.CommandLineToArgvW(cmd_line_w, &argc)
+	if argv_w == nil {
+		return nil, _get_platform_error()
+	}
+	argv = make([]string, argc, allocator) or_return
+	defer if err != nil {
+		for arg in argv {
+			delete(arg, allocator)
+		}
+		delete(argv, allocator)
+	}
+	for arg_w, i in argv_w[:argc] {
+		argv[i] = win32_wstring_to_utf8(arg_w, allocator) or_return
+	}
+	return
+}
+
+_build_command_line :: proc(command: []string, allocator: runtime.Allocator) -> string {
+	_write_byte_n_times :: #force_inline proc(builder: ^strings.Builder, b: byte, n: int) {
+		for _ in 0 ..< n {
+			strings.write_byte(builder, b)
+		}
+	}
+	builder := strings.builder_make(allocator)
+	for arg, i in command {
+		if i != 0 {
+			strings.write_byte(&builder, ' ')
+		}
+		j := 0
+		strings.write_byte(&builder, '"')
+		for j < len(arg) {
+			backslashes := 0
+			for j < len(arg) && arg[j] == '\\' {
+				backslashes += 1
+				j += 1
+			}
+			if j == len(arg) {
+				_write_byte_n_times(&builder, '\\', 2*backslashes)
+				break
+			} else if arg[j] == '"' {
+				_write_byte_n_times(&builder, '\\', 2*backslashes+1)
+				strings.write_byte(&builder, '"')
+			} else {
+				_write_byte_n_times(&builder, '\\', backslashes)
+				strings.write_byte(&builder, arg[j])
+			}
+			j += 1
+		}
+		strings.write_byte(&builder, '"')
+	}
+	return strings.to_string(builder)
+}
+
+_parse_environment_block :: proc(block: [^]u16, allocator: runtime.Allocator) -> (envs: []string, err: Error) {
+	zt_count := 0
+	for idx := 0; true; {
+		if block[idx] == 0x0000 {
+			zt_count += 1
+			if block[idx+1] == 0x0000 {
+				zt_count += 1
+				break
+			}
+		}
+		idx += 1
+	}
+
+	// Note(flysand): Each string in the environment block is terminated
+	// by a NUL character. In addition, the environment block itself is
+	// terminated by a NUL character. So the number of strings in the
+	// environment block is the number of NUL character minus the
+	// block terminator.
+	env_count := zt_count - 1
+	envs = make([]string, env_count, allocator) or_return
+	defer if err != nil {
+		for env in envs {
+			delete(env, allocator)
+		}
+		delete(envs, allocator)
+	}
+
+	env_idx := 0
+	last_idx := 0
+	idx := 0
+	for block[idx] != 0x0000 {
+		for block[idx] != 0x0000 {
+			idx += 1
+		}
+		env_w := block[last_idx:idx]
+		envs[env_idx] = win32_utf16_to_utf8(env_w, allocator) or_return
+		env_idx += 1
+		idx += 1
+		last_idx = idx
+	}
+	return
+}
+
+_build_environment_block :: proc(environment: []string, allocator: runtime.Allocator) -> string {
+	builder := strings.builder_make(allocator)
+	loop: #reverse for kv, cur_idx in environment {
+		eq_idx := strings.index_byte(kv, '=')
+		assert(eq_idx >= 0, "Malformed environment string. Expected '=' to separate keys and values")
+		key := kv[:eq_idx]
+		for old_kv in environment[cur_idx+1:] {
+			old_key := old_kv[:strings.index_byte(old_kv, '=')]
+			if key == old_key {
+				continue loop
+			}
+		}
+		strings.write_string(&builder, kv)
+		strings.write_byte(&builder, 0)
+	}
+	// Note(flysand): In addition to the NUL-terminator for each string, the
+	// environment block itself is NUL-terminated.
+	strings.write_byte(&builder, 0)
+	return strings.to_string(builder)
+}
diff --git a/core/os/os2/stat.odin b/core/os/os2/stat.odin
index f79ad9165..b3ca47be3 100644
--- a/core/os/os2/stat.odin
+++ b/core/os/os2/stat.odin
@@ -1,21 +1,34 @@
 package os2
 
-import "core:time"
 import "base:runtime"
+import "core:path/filepath"
+import "core:strings"
+import "core:time"
 
 Fstat_Callback :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error)
 
 File_Info :: struct {
 	fullpath:          string,
 	name:              string,
-	size:              i64,
-	mode:              File_Mode,
-	is_directory:      bool,
+
+	inode:             u128, // might be zero if cannot be determined
+	size:              i64 `fmt:"M"`,
+	mode:              int `fmt:"o"`,
+	type:              File_Type,
+
 	creation_time:     time.Time,
 	modification_time: time.Time,
 	access_time:       time.Time,
 }
 
+@(require_results)
+file_info_clone :: proc(fi: File_Info, allocator: runtime.Allocator) -> (cloned: File_Info, err: runtime.Allocator_Error) {
+	cloned = fi
+	cloned.fullpath = strings.clone(fi.fullpath) or_return
+	cloned.name = filepath.base(cloned.fullpath)
+	return
+}
+
 file_info_slice_delete :: proc(infos: []File_Info, allocator: runtime.Allocator) {
 	for i := len(infos)-1; i >= 0; i -= 1 {
 		file_info_delete(infos[i], allocator)
@@ -29,10 +42,12 @@ file_info_delete :: proc(fi: File_Info, allocator: runtime.Allocator) {
 
 @(require_results)
 fstat :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error) {
-	if f != nil && f.user_fstat != nil {
-		return f->user_fstat(allocator)
+	if f == nil {
+		return {}, nil
+	} else if f.fstat != nil {
+		return f->fstat(allocator)
 	}
-	return _fstat(f, allocator)
+	return {}, .Invalid_Callback
 }
 
 @(require_results)
@@ -41,6 +56,7 @@ stat :: proc(name: string, allocator: runtime.Allocator) -> (File_Info, Error) {
 }
 
 lstat :: stat_do_not_follow_links
+
 @(require_results)
 stat_do_not_follow_links :: proc(name: string, allocator: runtime.Allocator) -> (File_Info, Error) {
 	return _lstat(name, allocator)
@@ -51,3 +67,21 @@ stat_do_not_follow_links :: proc(name: string, allocator: runtime.Allocator) ->
 same_file :: proc(fi1, fi2: File_Info) -> bool {
 	return _same_file(fi1, fi2)
 }
+
+
+last_write_time         :: modification_time
+last_write_time_by_name :: modification_time_by_path
+
+@(require_results)
+modification_time :: proc(f: ^File) -> (time.Time, Error) {
+	TEMP_ALLOCATOR_GUARD()
+	fi, err := fstat(f, temp_allocator())
+	return fi.modification_time, err
+}
+
+@(require_results)
+modification_time_by_path :: proc(path: string) -> (time.Time, Error) {
+	TEMP_ALLOCATOR_GUARD()
+	fi, err := stat(path, temp_allocator())
+	return fi.modification_time, err
+}
diff --git a/core/os/os2/stat_linux.odin b/core/os/os2/stat_linux.odin
index c0b3088b4..6ccac1be0 100644
--- a/core/os/os2/stat_linux.odin
+++ b/core/os/os2/stat_linux.odin
@@ -7,31 +7,43 @@ import "core:sys/linux"
 import "core:path/filepath"
 
 _fstat :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error) {
-	return _fstat_internal(f.impl.fd, allocator)
+	impl := (^File_Impl)(f.impl)
+	return _fstat_internal(impl.fd, allocator)
 }
 
-_fstat_internal :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> (File_Info, Error) {
+_fstat_internal :: proc(fd: linux.Fd, allocator: runtime.Allocator) -> (fi: File_Info, err: Error) {
 	s: linux.Stat
 	errno := linux.fstat(fd, &s)
 	if errno != .NONE {
 		return {}, _get_platform_error(errno)
 	}
+	type := File_Type.Regular
+	switch s.mode & linux.S_IFMT {
+	case linux.S_IFBLK:  type = .Block_Device
+	case linux.S_IFCHR:  type = .Character_Device
+	case linux.S_IFDIR:  type = .Directory
+	case linux.S_IFIFO:  type = .Named_Pipe
+	case linux.S_IFLNK:  type = .Symlink
+	case linux.S_IFREG:  type = .Regular
+	case linux.S_IFSOCK: type = .Socket
+	}
+	mode := int(0o7777 & transmute(u32)s.mode)
 
 	// TODO: As of Linux 4.11, the new statx syscall can retrieve creation_time
-	fi := File_Info {
-		fullpath = _get_full_path(fd, allocator),
-		name = "",
-		size = i64(s.size),
-		mode = 0,
-		is_directory = linux.S_ISDIR(s.mode),
+	fi = File_Info {
+		fullpath          = _get_full_path(fd, allocator) or_return,
+		name              = "",
+		inode             = u128(u64(s.ino)),
+		size              = i64(s.size),
+		mode              = mode,
+		type              = type,
 		modification_time = time.Time {i64(s.mtime.time_sec) * i64(time.Second) + i64(s.mtime.time_nsec)},
-		access_time = time.Time {i64(s.atime.time_sec) * i64(time.Second) + i64(s.atime.time_nsec)},
-		creation_time = time.Time{i64(s.ctime.time_sec) * i64(time.Second) + i64(s.ctime.time_nsec)}, // regular stat does not provide this
+		access_time       = time.Time {i64(s.atime.time_sec) * i64(time.Second) + i64(s.atime.time_nsec)},
+		creation_time     = time.Time{i64(s.ctime.time_sec) * i64(time.Second) + i64(s.ctime.time_nsec)}, // regular stat does not provide this
 	}
 	fi.creation_time = fi.modification_time
-
 	fi.name = filepath.base(fi.fullpath)
-	return fi, nil
+	return
 }
 
 // NOTE: _stat and _lstat are using _fstat to avoid a race condition when populating fullpath
diff --git a/core/os/os2/stat_windows.odin b/core/os/os2/stat_windows.odin
index 03ad2052f..3a3a3b1b4 100644
--- a/core/os/os2/stat_windows.odin
+++ b/core/os/os2/stat_windows.odin
@@ -7,7 +7,7 @@ import "core:strings"
 import win32 "core:sys/windows"
 
 _fstat :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error) {
-	if f == nil || f.impl.fd == nil {
+	if f == nil || (^File_Impl)(f.impl).fd == nil {
 		return {}, nil
 	}
 
@@ -19,28 +19,29 @@ _fstat :: proc(f: ^File, allocator: runtime.Allocator) -> (File_Info, Error) {
 	h := _handle(f)
 	switch win32.GetFileType(h) {
 	case win32.FILE_TYPE_PIPE, win32.FILE_TYPE_CHAR:
-		fi: File_Info
-		fi.fullpath = path
-		fi.name = basename(path)
-		fi.mode |= file_type_mode(h)
+		fi := File_Info {
+			fullpath = path,
+			name = basename(path),
+			type = file_type(h),
+		}
 		return fi, nil
 	}
 
 	return _file_info_from_get_file_information_by_handle(path, h, allocator)
 }
+
 _stat :: proc(name: string, allocator: runtime.Allocator) -> (File_Info, Error) {
 	return internal_stat(name, win32.FILE_FLAG_BACKUP_SEMANTICS, allocator)
 }
+
 _lstat :: proc(name: string, allocator: runtime.Allocator) -> (File_Info, Error) {
 	return internal_stat(name, win32.FILE_FLAG_BACKUP_SEMANTICS|win32.FILE_FLAG_OPEN_REPARSE_POINT, allocator)
 }
+
 _same_file :: proc(fi1, fi2: File_Info) -> bool {
 	return fi1.fullpath == fi2.fullpath
 }
 
-
-
-
 full_path_from_name :: proc(name: string, allocator: runtime.Allocator) -> (path: string, err: Error) {
 	name := name
 	if name == "" {
@@ -48,7 +49,7 @@ full_path_from_name :: proc(name: string, allocator: runtime.Allocator) -> (path
 	}
 	TEMP_ALLOCATOR_GUARD()
 
-	p := win32.utf8_to_utf16(name, temp_allocator())
+	p := win32_utf8_to_utf16(name, temp_allocator()) or_return
 
 	n := win32.GetFullPathNameW(raw_data(p), 0, nil, nil)
 	if n == 0 {
@@ -59,16 +60,16 @@ full_path_from_name :: proc(name: string, allocator: runtime.Allocator) -> (path
 	if n == 0 {
 		return "", _get_platform_error()
 	}
-	return win32.utf16_to_utf8(buf[:n], allocator)
+	return win32_utf16_to_utf8(buf[:n], allocator)
 }
 
-
 internal_stat :: proc(name: string, create_file_attributes: u32, allocator: runtime.Allocator) -> (fi: File_Info, e: Error) {
 	if len(name) == 0 {
 		return {}, .Not_Exist
 	}
+	TEMP_ALLOCATOR_GUARD()
 
-	wname := _fix_long_path(name)
+	wname := _fix_long_path(name, temp_allocator()) or_return
 	fa: win32.WIN32_FILE_ATTRIBUTE_DATA
 	ok := win32.GetFileAttributesExW(wname, win32.GetFileExInfoStandard, &fa)
 	if ok && fa.dwFileAttributes & win32.FILE_ATTRIBUTE_REPARSE_POINT == 0 {
@@ -99,7 +100,6 @@ internal_stat :: proc(name: string, create_file_attributes: u32, allocator: runt
 	return _file_info_from_get_file_information_by_handle(name, h, allocator)
 }
 
-
 _cleanpath_strip_prefix :: proc(buf: []u16) -> []u16 {
 	buf := buf
 	N := 0
@@ -120,9 +120,8 @@ _cleanpath_strip_prefix :: proc(buf: []u16) -> []u16 {
 	return buf
 }
 
-
 _cleanpath_from_handle :: proc(f: ^File, allocator: runtime.Allocator) -> (string, Error) {
-	if f == nil || f.impl.fd == nil {
+	if f == nil {
 		return "", nil
 	}
 	h := _handle(f)
@@ -138,7 +137,7 @@ _cleanpath_from_handle :: proc(f: ^File, allocator: runtime.Allocator) -> (strin
 }
 
 _cleanpath_from_handle_u16 :: proc(f: ^File) -> ([]u16, Error) {
-	if f == nil || f.impl.fd == nil {
+	if f  == nil {
 		return nil, nil
 	}
 	h := _handle(f)
@@ -156,10 +155,9 @@ _cleanpath_from_handle_u16 :: proc(f: ^File) -> ([]u16, Error) {
 _cleanpath_from_buf :: proc(buf: []u16, allocator: runtime.Allocator) -> (string, runtime.Allocator_Error) {
 	buf := buf
 	buf = _cleanpath_strip_prefix(buf)
-	return win32.utf16_to_utf8(buf, allocator)
+	return win32_utf16_to_utf8(buf, allocator)
 }
 
-
 basename :: proc(name: string) -> (base: string) {
 	name := name
 	if len(name) > 3 && name[:3] == `\\?` {
@@ -185,83 +183,67 @@ basename :: proc(name: string) -> (base: string) {
 	return name
 }
 
-
-file_type_mode :: proc(h: win32.HANDLE) -> File_Mode {
+file_type :: proc(h: win32.HANDLE) -> File_Type {
 	switch win32.GetFileType(h) {
-	case win32.FILE_TYPE_PIPE:
-		return File_Mode_Named_Pipe
-	case win32.FILE_TYPE_CHAR:
-		return File_Mode_Device | File_Mode_Char_Device
+	case win32.FILE_TYPE_PIPE: return .Named_Pipe
+	case win32.FILE_TYPE_CHAR: return .Character_Device
+	case win32.FILE_TYPE_DISK: return .Regular
 	}
-	return 0
+	return .Undetermined
 }
 
-
-
-_file_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: win32.HANDLE, ReparseTag: win32.DWORD) -> (mode: File_Mode) {
+_file_type_mode_from_file_attributes :: proc(file_attributes: win32.DWORD, h: win32.HANDLE, ReparseTag: win32.DWORD) -> (type: File_Type, mode: int) {
 	if file_attributes & win32.FILE_ATTRIBUTE_READONLY != 0 {
 		mode |= 0o444
 	} else {
 		mode |= 0o666
 	}
-
 	is_sym := false
 	if file_attributes & win32.FILE_ATTRIBUTE_REPARSE_POINT == 0 {
 		is_sym = false
 	} else {
 		is_sym = ReparseTag == win32.IO_REPARSE_TAG_SYMLINK || ReparseTag == win32.IO_REPARSE_TAG_MOUNT_POINT
 	}
-
 	if is_sym {
-		mode |= File_Mode_Sym_Link
+		type = .Symlink
 	} else {
 		if file_attributes & win32.FILE_ATTRIBUTE_DIRECTORY != 0 {
-			mode |= 0o111 | File_Mode_Dir
+			type = .Directory
+			mode |= 0o111
 		}
-
 		if h != nil {
-			mode |= file_type_mode(h)
+			type = file_type(h)
 		}
 	}
-
 	return
 }
 
-
 _file_info_from_win32_file_attribute_data :: proc(d: ^win32.WIN32_FILE_ATTRIBUTE_DATA, name: string, allocator: runtime.Allocator) -> (fi: File_Info, e: Error) {
 	fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
-
-	fi.mode |= _file_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
-	fi.is_directory = fi.mode & File_Mode_Dir != 0
-
+	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
+	fi.type = type
+	fi.mode |= mode
 	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
 	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
 	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
-
 	fi.fullpath, e = full_path_from_name(name, allocator)
 	fi.name = basename(fi.fullpath)
-
 	return
 }
 
-
 _file_info_from_win32_find_data :: proc(d: ^win32.WIN32_FIND_DATAW, name: string, allocator: runtime.Allocator) -> (fi: File_Info, e: Error) {
 	fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
-
-	fi.mode |= _file_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
-	fi.is_directory = fi.mode & File_Mode_Dir != 0
-
+	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
+	fi.type = type
+	fi.mode |= mode
 	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
 	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
 	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
-
 	fi.fullpath, e = full_path_from_name(name, allocator)
 	fi.name = basename(fi.fullpath)
-
 	return
 }
 
-
 _file_info_from_get_file_information_by_handle :: proc(path: string, h: win32.HANDLE, allocator: runtime.Allocator) -> (File_Info, Error) {
 	d: win32.BY_HANDLE_FILE_INFORMATION
 	if !win32.GetFileInformationByHandle(h, &d) {
@@ -278,25 +260,20 @@ _file_info_from_get_file_information_by_handle :: proc(path: string, h: win32.HA
 		// Indicate this is a symlink on FAT file systems
 		ti.ReparseTag = 0
 	}
-
 	fi: File_Info
-
 	fi.fullpath = path
 	fi.name = basename(path)
-	fi.size = i64(d.nFileSizeHigh)<<32 + i64(d.nFileSizeLow)
-
-	fi.mode |= _file_mode_from_file_attributes(ti.FileAttributes, h, ti.ReparseTag)
-	fi.is_directory = fi.mode & File_Mode_Dir != 0
-
+	fi.inode = u128(u64(d.nFileIndexHigh)<<32 + u64(d.nFileIndexLow))
+	fi.size  = i64(d.nFileSizeHigh)<<32  + i64(d.nFileSizeLow)
+	type, mode := _file_type_mode_from_file_attributes(d.dwFileAttributes, nil, 0)
+	fi.type = type
+	fi.mode |= mode
 	fi.creation_time     = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftCreationTime))
 	fi.modification_time = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastWriteTime))
 	fi.access_time       = time.unix(0, win32.FILETIME_as_unix_nanoseconds(d.ftLastAccessTime))
-
 	return fi, nil
 }
 
-
-
 reserved_names := [?]string{
 	"CON", "PRN", "AUX", "NUL",
 	"COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
@@ -357,7 +334,6 @@ _volume_name_len :: proc(path: string) -> int {
 	return 0
 }
 
-
 _is_abs :: proc(path: string) -> bool {
 	if _is_reserved_name(path) {
 		return true
diff --git a/core/os/os2/temp_file.odin b/core/os/os2/temp_file.odin
index 3b3dbdd57..467775e89 100644
--- a/core/os/os2/temp_file.odin
+++ b/core/os/os2/temp_file.odin
@@ -26,7 +26,7 @@ create_temp_file :: proc(dir, pattern: string) -> (f: ^File, err: Error) {
 	attempts := 0
 	for {
 		name := concatenate_strings_from_buffer(name_buf[:], prefix, random_string(rand_buf[:]), suffix)
-		f, err = open(name, {.Read, .Write, .Create, .Excl}, File_Mode(0o666))
+		f, err = open(name, {.Read, .Write, .Create, .Excl}, 0o666)
 		if err == .Exist {
 			close(f)
 			attempts += 1
diff --git a/core/os/os2/temp_file_windows.odin b/core/os/os2/temp_file_windows.odin
index 4c8ab9fb7..d888eda52 100644
--- a/core/os/os2/temp_file_windows.odin
+++ b/core/os/os2/temp_file_windows.odin
@@ -19,5 +19,5 @@ _temp_dir :: proc(allocator: runtime.Allocator) -> (string, runtime.Allocator_Er
 	} else if n > 0 && b[n-1] == '\\' {
 		n -= 1
 	}
-	return win32.utf16_to_utf8(b[:n], allocator)
+	return win32_utf16_to_utf8(b[:n], allocator)
 }
diff --git a/core/os/os_freestanding.odin b/core/os/os_freestanding.odin
index 55ce1d12e..c908e3738 100644
--- a/core/os/os_freestanding.odin
+++ b/core/os/os_freestanding.odin
@@ -1,4 +1,4 @@
-//+freestanding
+//+build freestanding
 package os
 
-#panic("package os does not support a freestanding target")
\ No newline at end of file
+#panic("package os does not support a freestanding target")
diff --git a/core/reflect/reflect.odin b/core/reflect/reflect.odin
index 332d91c6e..23c0f803e 100644
--- a/core/reflect/reflect.odin
+++ b/core/reflect/reflect.odin
@@ -143,7 +143,7 @@ when !ODIN_NO_RTTI {
 @(require_results)
 any_base :: proc(v: any) -> any {
 	v := v
-	if v != nil {
+	if v.id != nil {
 		v.id = typeid_base(v.id)
 	}
 	return v
@@ -151,7 +151,7 @@ any_base :: proc(v: any) -> any {
 @(require_results)
 any_core :: proc(v: any) -> any {
 	v := v
-	if v != nil {
+	if v.id != nil {
 		v.id = typeid_core(v.id)
 	}
 	return v
@@ -391,7 +391,7 @@ Struct_Field :: struct {
 struct_field_at :: proc(T: typeid, i: int) -> (field: Struct_Field) {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		if 0 <= i && i < len(s.names) {
+		if 0 <= i && i < int(s.field_count) {
 			field.name     = s.names[i]
 			field.type     = s.types[i]
 			field.tag      = Struct_Tag(s.tags[i])
@@ -406,7 +406,7 @@ struct_field_at :: proc(T: typeid, i: int) -> (field: Struct_Field) {
 struct_field_by_name :: proc(T: typeid, name: string) -> (field: Struct_Field) {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		for fname, i in s.names {
+		for fname, i in s.names[:s.field_count] {
 			if fname == name {
 				field.name     = s.names[i]
 				field.type     = s.types[i]
@@ -427,7 +427,7 @@ struct_field_value_by_name :: proc(a: any, field: string, allow_using := false)
 	ti := runtime.type_info_base(type_info_of(a.id))
 
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		for name, i in s.names {
+		for name, i in s.names[:s.field_count] {
 			if name == field {
 				return any{
 					rawptr(uintptr(a.data) + s.offsets[i]),
@@ -463,7 +463,7 @@ struct_field_value :: proc(a: any, field: Struct_Field) -> any {
 struct_field_names :: proc(T: typeid) -> []string {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		return s.names
+		return s.names[:s.field_count]
 	}
 	return nil
 }
@@ -472,7 +472,7 @@ struct_field_names :: proc(T: typeid) -> []string {
 struct_field_types :: proc(T: typeid) -> []^Type_Info {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		return s.types
+		return s.types[:s.field_count]
 	}
 	return nil
 }
@@ -482,7 +482,7 @@ struct_field_types :: proc(T: typeid) -> []^Type_Info {
 struct_field_tags :: proc(T: typeid) -> []Struct_Tag {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		return transmute([]Struct_Tag)s.tags
+		return transmute([]Struct_Tag)s.tags[:s.field_count]
 	}
 	return nil
 }
@@ -491,7 +491,7 @@ struct_field_tags :: proc(T: typeid) -> []Struct_Tag {
 struct_field_offsets :: proc(T: typeid) -> []uintptr {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
-		return s.offsets
+		return s.offsets[:s.field_count]
 	}
 	return nil
 }
@@ -501,11 +501,11 @@ struct_fields_zipped :: proc(T: typeid) -> (fields: #soa[]Struct_Field) {
 	ti := runtime.type_info_base(type_info_of(T))
 	if s, ok := ti.variant.(runtime.Type_Info_Struct); ok {
 		return soa_zip(
-			name     = s.names,
-			type     = s.types,
-			tag      = transmute([]Struct_Tag)s.tags,
-			offset   = s.offsets,
-			is_using = s.usings,
+			name     = s.names[:s.field_count],
+			type     = s.types[:s.field_count],
+			tag      = ([^]Struct_Tag)(s.tags)[:s.field_count],
+			offset   = s.offsets[:s.field_count],
+			is_using = s.usings[:s.field_count],
 		)
 	}
 	return nil
@@ -1569,7 +1569,7 @@ equal :: proc(a, b: any, including_indirect_array_recursion := false, recursion_
 		if v.equal != nil {
 			return v.equal(a.data, b.data)
 		} else {
-			for offset, i in v.offsets {
+			for offset, i in v.offsets[:v.field_count] {
 				x := rawptr(uintptr(a.data) + offset)
 				y := rawptr(uintptr(b.data) + offset)
 				id := v.types[i].id
diff --git a/core/reflect/types.odin b/core/reflect/types.odin
index 04dd8a52d..4f0674dc8 100644
--- a/core/reflect/types.odin
+++ b/core/reflect/types.odin
@@ -115,16 +115,14 @@ are_types_identical :: proc(a, b: ^Type_Info) -> bool {
 	case Type_Info_Struct:
 		y := b.variant.(Type_Info_Struct) or_return
 		switch {
-		case len(x.types)    != len(y.types),
-		     x.is_packed     != y.is_packed,
-		     x.is_raw_union  != y.is_raw_union,
-		     x.custom_align  != y.custom_align,
+		case x.field_count   != y.field_count,
+		     x.flags         != y.flags,
 		     x.soa_kind      != y.soa_kind,
 		     x.soa_base_type != y.soa_base_type,
 		     x.soa_len       != y.soa_len:
 			return false
 		}
-		for _, i in x.types {
+		for i in 0..<x.field_count {
 			xn, yn := x.names[i], y.names[i]
 			xt, yt := x.types[i], y.types[i]
 			xl, yl := x.tags[i],  y.tags[i]
@@ -179,8 +177,8 @@ are_types_identical :: proc(a, b: ^Type_Info) -> bool {
 	case Type_Info_Bit_Field:
 		y := b.variant.(Type_Info_Bit_Field) or_return
 		if !are_types_identical(x.backing_type, y.backing_type) { return false }
-		if len(x.names) != len(y.names) { return false }
-		for _, i in x.names {
+		if x.field_count != y.field_count { return false }
+		for _, i in x.names[:x.field_count] {
 			if x.names[i] != y.names[i] {
 				return false
 			}
@@ -368,13 +366,13 @@ is_tuple :: proc(info: ^Type_Info) -> bool {
 is_struct :: proc(info: ^Type_Info) -> bool {
 	if info == nil { return false }
 	s, ok := type_info_base(info).variant.(Type_Info_Struct)
-	return ok && !s.is_raw_union
+	return ok && .raw_union not_in s.flags
 }
 @(require_results)
 is_raw_union :: proc(info: ^Type_Info) -> bool {
 	if info == nil { return false }
 	s, ok := type_info_base(info).variant.(Type_Info_Struct)
-	return ok && s.is_raw_union
+	return ok && .raw_union in s.flags
 }
 @(require_results)
 is_union :: proc(info: ^Type_Info) -> bool {
@@ -495,7 +493,7 @@ write_type_builder :: proc(buf: ^strings.Builder, ti: ^Type_Info) -> int {
 	n, _ := write_type_writer(strings.to_writer(buf), ti)
 	return n
 }
-write_type_writer :: proc(w: io.Writer, ti: ^Type_Info, n_written: ^int = nil) -> (n: int, err: io.Error) {
+write_type_writer :: #force_no_inline proc(w: io.Writer, ti: ^Type_Info, n_written: ^int = nil) -> (n: int, err: io.Error) {
 	defer if n_written != nil {
 		n_written^ += n
 	}
@@ -656,15 +654,16 @@ write_type_writer :: proc(w: io.Writer, ti: ^Type_Info, n_written: ^int = nil) -
 		}
 
 		io.write_string(w, "struct ", &n) or_return
-		if info.is_packed    { io.write_string(w, "#packed ",    &n) or_return }
-		if info.is_raw_union { io.write_string(w, "#raw_union ", &n) or_return }
-		if info.custom_align {
+		if .packed    in info.flags { io.write_string(w, "#packed ",    &n) or_return }
+		if .raw_union in info.flags { io.write_string(w, "#raw_union ", &n) or_return }
+		if .no_copy   in info.flags { io.write_string(w, "#no_copy ", &n) or_return }
+		if .align in info.flags {
 			io.write_string(w, "#align(",      &n) or_return
 			io.write_i64(w, i64(ti.align), 10, &n) or_return
 			io.write_string(w, ") ",           &n) or_return
 		}
 		io.write_byte(w, '{', &n) or_return
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { io.write_string(w, ", ", &n) or_return }
 			io.write_string(w, name,     &n) or_return
 			io.write_string(w, ": ",     &n) or_return
@@ -722,7 +721,7 @@ write_type_writer :: proc(w: io.Writer, ti: ^Type_Info, n_written: ^int = nil) -
 		io.write_string(w, "bit_field ", &n) or_return
 		write_type(w, info.backing_type, &n) or_return
 		io.write_string(w, " {",         &n) or_return
-		for name, i in info.names {
+		for name, i in info.names[:info.field_count] {
 			if i > 0 { io.write_string(w, ", ", &n) or_return }
 			io.write_string(w, name,     &n) or_return
 			io.write_string(w, ": ",     &n) or_return
diff --git a/core/simd/x86/aes.odin b/core/simd/x86/aes.odin
index 3a32de0d6..a2cd2e4d3 100644
--- a/core/simd/x86/aes.odin
+++ b/core/simd/x86/aes.odin
@@ -2,33 +2,33 @@
 package simd_x86
 
 @(require_results, enable_target_feature = "aes")
-_mm_aesdec :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+_mm_aesdec_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 	return aesdec(a, b)
 }
 
 @(require_results, enable_target_feature = "aes")
-_mm_aesdeclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+_mm_aesdeclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 	return aesdeclast(a, b)
 }
 
 @(require_results, enable_target_feature = "aes")
-_mm_aesenc :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+_mm_aesenc_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 	return aesenc(a, b)
 }
 
 @(require_results, enable_target_feature = "aes")
-_mm_aesenclast :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
+_mm_aesenclast_si128 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 	return aesenclast(a, b)
 }
 
 @(require_results, enable_target_feature = "aes")
-_mm_aesimc :: #force_inline proc "c" (a: __m128i) -> __m128i {
+_mm_aesimc_si128 :: #force_inline proc "c" (a: __m128i) -> __m128i {
 	return aesimc(a)
 }
 
 @(require_results, enable_target_feature = "aes")
-_mm_aeskeygenassist :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i {
-	return aeskeygenassist(a, u8(IMM8))
+_mm_aeskeygenassist_si128 :: #force_inline proc "c" (a: __m128i, $IMM8: u8) -> __m128i {
+	return aeskeygenassist(a, IMM8)
 }
 
 
@@ -45,5 +45,5 @@ foreign _ {
 	@(link_name = "llvm.x86.aesni.aesimc")
 	aesimc :: proc(a: __m128i) -> __m128i ---
 	@(link_name = "llvm.x86.aesni.aeskeygenassist")
-	aeskeygenassist :: proc(a: __m128i, imm8: u8) -> __m128i ---
+	aeskeygenassist :: proc(a: __m128i, #const imm8: u8) -> __m128i ---
 }
diff --git a/core/simd/x86/sse2.odin b/core/simd/x86/sse2.odin
index 52286cbb8..426359031 100644
--- a/core/simd/x86/sse2.odin
+++ b/core/simd/x86/sse2.odin
@@ -144,19 +144,26 @@ _mm_subs_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
 _mm_slli_si128_impl :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> __m128i {
 	shift :: IMM8 & 0xff
 
+	// This needs to emit behavior identical to PSLLDQ which is as follows:
+	//
+	// TEMP := COUNT
+	// IF (TEMP > 15) THEN TEMP := 16; FI
+	// DEST := DEST << (TEMP * 8)
+	// DEST[MAXVL-1:128] (Unmodified)
+
 	return transmute(__m128i)simd.shuffle(
-		transmute(i8x16)a,
 		i8x16(0),
-		0  when shift > 15 else (16 - shift + 0),
-		1  when shift > 15 else (16 - shift + 1),
-		2  when shift > 15 else (16 - shift + 2),
-		3  when shift > 15 else (16 - shift + 3),
-		4  when shift > 15 else (16 - shift + 4),
-		5  when shift > 15 else (16 - shift + 5),
-		6  when shift > 15 else (16 - shift + 6),
-		7  when shift > 15 else (16 - shift + 7),
-		8  when shift > 15 else (16 - shift + 8),
-		9  when shift > 15 else (16 - shift + 9),
+		transmute(i8x16)a,
+		0 when shift > 15 else (16 - shift + 0),
+		1 when shift > 15 else (16 - shift + 1),
+		2 when shift > 15 else (16 - shift + 2),
+		3 when shift > 15 else (16 - shift + 3),
+		4 when shift > 15 else (16 - shift + 4),
+		5 when shift > 15 else (16 - shift + 5),
+		6 when shift > 15 else (16 - shift + 6),
+		7 when shift > 15 else (16 - shift + 7),
+		8 when shift > 15 else (16 - shift + 8),
+		9 when shift > 15 else (16 - shift + 9),
 		10 when shift > 15 else (16 - shift + 10),
 		11 when shift > 15 else (16 - shift + 11),
 		12 when shift > 15 else (16 - shift + 12),
@@ -435,7 +442,7 @@ _mm_store_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
 }
 @(enable_target_feature="sse2")
 _mm_storeu_si128 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
-	storeudq(mem_addr, a)
+	intrinsics.unaligned_store(mem_addr, a)
 }
 @(enable_target_feature="sse2")
 _mm_storel_epi64 :: #force_inline proc "c" (mem_addr: ^__m128i, a: __m128i) {
@@ -1178,8 +1185,6 @@ foreign _ {
 	cvttsd2si  :: proc(a: __m128d) -> i32 ---
 	@(link_name="llvm.x86.sse2.cvttps2dq")
 	cvttps2dq  :: proc(a: __m128) -> i32x4 ---
-	@(link_name="llvm.x86.sse2.storeu.dq")
-	storeudq   :: proc(mem_addr: rawptr, a: __m128i) ---
 	@(link_name="llvm.x86.sse2.storeu.pd")
 	storeupd   :: proc(mem_addr: rawptr, a: __m128d) ---
 
diff --git a/core/sys/linux/bits.odin b/core/sys/linux/bits.odin
index 1e9e5bbbd..e10edf558 100644
--- a/core/sys/linux/bits.odin
+++ b/core/sys/linux/bits.odin
@@ -244,7 +244,7 @@ Mode_Bits :: enum {
 	ISVTX  = 9,  // 0o0001000
 	ISGID  = 10, // 0o0002000
 	ISUID  = 11, // 0o0004000
-	IFFIFO = 12, // 0o0010000
+	IFIFO = 12, // 0o0010000
 	IFCHR  = 13, // 0o0020000
 	IFDIR  = 14, // 0o0040000
 	IFREG  = 15, // 0o0100000
@@ -1815,3 +1815,11 @@ EPoll_Ctl_Opcode :: enum i32 {
 	DEL = 2,
 	MOD = 3,
 }
+
+/*
+	Bits for execveat(2) flags.
+*/
+Execveat_Flags_Bits :: enum {
+	AT_SYMLINK_NOFOLLOW = 8,
+	AT_EMPTY_PATH       = 12,
+}
diff --git a/core/sys/linux/constants.odin b/core/sys/linux/constants.odin
index 51f7db68f..f3e9f5ff9 100644
--- a/core/sys/linux/constants.odin
+++ b/core/sys/linux/constants.odin
@@ -39,11 +39,11 @@ PRIO_MIN :: -20
 SIGRTMIN :: Signal(32)
 SIGRTMAX :: Signal(64)
 
-S_IFMT   :: Mode{.IFREG, .IFDIR, .IFCHR, .IFFIFO}
+S_IFMT   :: Mode{.IFREG, .IFDIR, .IFCHR, .IFIFO}
 S_IFSOCK :: Mode{.IFREG, .IFDIR}
 S_IFLNK  :: Mode{.IFREG, .IFCHR}
 S_IFBLK  :: Mode{.IFDIR, .IFCHR}
-S_IFFIFO :: Mode{.IFFIFO}
+S_IFIFO  :: Mode{.IFIFO}
 S_IFCHR  :: Mode{.IFCHR}
 S_IFDIR  :: Mode{.IFDIR}
 S_IFREG  :: Mode{.IFREG}
@@ -51,7 +51,7 @@ S_IFREG  :: Mode{.IFREG}
 /*
 	Checks the Mode bits to see if the file is a named pipe (FIFO).
 */
-S_ISFIFO :: #force_inline proc "contextless" (m: Mode) -> bool {return (S_IFFIFO  == (m & S_IFMT))}
+S_ISFIFO :: #force_inline proc "contextless" (m: Mode) -> bool {return (S_IFIFO  == (m & S_IFMT))}
 
 /*
 	Check the Mode bits to see if the file is a character device.
diff --git a/core/sys/linux/helpers.odin b/core/sys/linux/helpers.odin
index 75fdd586e..f1abbbf61 100644
--- a/core/sys/linux/helpers.odin
+++ b/core/sys/linux/helpers.odin
@@ -12,7 +12,7 @@ import "base:intrinsics"
 
 @(private)
 syscall0 :: #force_inline proc "contextless" (nr: uintptr) -> int {
-	return cast(int) intrinsics.syscall(nr)
+	return int(intrinsics.syscall(nr))
 }
 
 @(private)
@@ -20,7 +20,7 @@ syscall1 :: #force_inline proc "contextless" (nr: uintptr, p1: $T) -> int
 where
 	size_of(p1) <= size_of(uintptr)
 {
-	return cast(int) intrinsics.syscall(nr, cast(uintptr) p1)
+	return int(intrinsics.syscall(nr, uintptr(p1)))
 }
 
 @(private)
@@ -29,8 +29,7 @@ where
 	size_of(p1) <= size_of(uintptr),
 	size_of(p2) <= size_of(uintptr) 
 {
-	return cast(int) intrinsics.syscall(nr,
-		cast(uintptr) p1, cast(uintptr) p2)
+	return int(intrinsics.syscall(nr, uintptr(p1), uintptr(p2)))
 }
 
 @(private)
@@ -40,10 +39,11 @@ where
 	size_of(p2) <= size_of(uintptr),
 	size_of(p3) <= size_of(uintptr)
 {
-	return cast(int) intrinsics.syscall(nr,
-		cast(uintptr) p1,
-		cast(uintptr) p2,
-		cast(uintptr) p3)
+	return int(intrinsics.syscall(nr,
+		uintptr(p1),
+		uintptr(p2),
+		uintptr(p3),
+	))
 }
 
 @(private)
@@ -54,11 +54,12 @@ where
 	size_of(p3) <= size_of(uintptr),
 	size_of(p4) <= size_of(uintptr)
 {
-	return cast(int) intrinsics.syscall(nr,
-		cast(uintptr) p1,
-		cast(uintptr) p2,
-		cast(uintptr) p3,
-		cast(uintptr) p4)
+	return int(intrinsics.syscall(nr,
+		uintptr(p1),
+		uintptr(p2),
+		uintptr(p3),
+		uintptr(p4),
+	))
 }
 
 @(private)
@@ -70,12 +71,13 @@ where
 	size_of(p4) <= size_of(uintptr),
 	size_of(p5) <= size_of(uintptr)
 {
-	return cast(int) intrinsics.syscall(nr,
-		cast(uintptr) p1,
-		cast(uintptr) p2,
-		cast(uintptr) p3,
-		cast(uintptr) p4,
-		cast(uintptr) p5)
+	return int(intrinsics.syscall(nr,
+		uintptr(p1),
+		uintptr(p2),
+		uintptr(p3),
+		uintptr(p4),
+		uintptr(p5),
+	))
 }
 
 @(private)
@@ -88,13 +90,14 @@ where
 	size_of(p5) <= size_of(uintptr),
 	size_of(p6) <= size_of(uintptr)
 {
-	return cast(int) intrinsics.syscall(nr,
-		cast(uintptr) p1,
-		cast(uintptr) p2,
-		cast(uintptr) p3,
-		cast(uintptr) p4,
-		cast(uintptr) p5,
-		cast(uintptr) p6)
+	return int(intrinsics.syscall(nr,
+		uintptr(p1),
+		uintptr(p2),
+		uintptr(p3),
+		uintptr(p4),
+		uintptr(p5),
+		uintptr(p6),
+	))
 }
 
 syscall :: proc {syscall0, syscall1, syscall2, syscall3, syscall4, syscall5, syscall6}
@@ -113,7 +116,7 @@ where
 		default_value: T
 		return default_value, Errno(-ret)
 	} else {
-		return cast(T) transmute(U) ret, Errno(.NONE)
+		return T(transmute(U)ret), Errno(.NONE)
 	}
 }
 
@@ -123,7 +126,7 @@ errno_unwrap2 :: #force_inline proc "contextless" (ret: $P, $T: typeid) -> (T, E
 		default_value: T
 		return default_value, Errno(-ret)
 	} else {
-		return cast(T) ret, Errno(.NONE)
+		return T(ret), Errno(.NONE)
 	}
 }
 
diff --git a/core/sys/linux/sys.odin b/core/sys/linux/sys.odin
index 90db862e2..ec7357c48 100644
--- a/core/sys/linux/sys.odin
+++ b/core/sys/linux/sys.odin
@@ -749,17 +749,13 @@ getsockopt :: proc {
 	getsockopt_base,
 }
 
-// TODO(flysand): clone (probably not in this PR, maybe not ever)
-
 /*
 	Creates a copy of the running process.
 	Available since Linux 1.0.
 */
 fork :: proc "contextless" () -> (Pid, Errno) {
 	when ODIN_ARCH == .arm64 {
-		// Note(flysand): this syscall is not documented, but the bottom 8 bits of flags
-		// are for exit signal
-		ret := syscall(SYS_clone, Signal.SIGCHLD)
+		ret := syscall(SYS_clone, u64(Signal.SIGCHLD), cast(rawptr) nil, cast(rawptr) nil, cast(rawptr) nil, u64(0))
 		return errno_unwrap(ret, Pid)
 	} else {
 		ret := syscall(SYS_fork)
@@ -789,7 +785,7 @@ execve :: proc "contextless" (name: cstring, argv: [^]cstring, envp: [^]cstring)
 		ret := syscall(SYS_execve, cast(rawptr) name, cast(rawptr) argv, cast(rawptr) envp)
 		return Errno(-ret)
 	} else {
-		ret := syscall(SYS_execveat, AT_FDCWD, cast(rawptr) name, cast(rawptr) argv, cast(rawptr) envp)
+		ret := syscall(SYS_execveat, AT_FDCWD, cast(rawptr) name, cast(rawptr) argv, cast(rawptr) envp, i32(0))
 		return Errno(-ret)
 	}
 }
@@ -2818,7 +2814,7 @@ getrandom :: proc "contextless" (buf: []u8, flags: Get_Random_Flags) -> (int, Er
 	Execute program relative to a directory file descriptor.
 	Available since Linux 3.19.
 */
-execveat :: proc "contextless" (dirfd: Fd, name: cstring, argv: [^]cstring, envp: [^]cstring, flags: FD_Flags = {}) -> (Errno) {
+execveat :: proc "contextless" (dirfd: Fd, name: cstring, argv: [^]cstring, envp: [^]cstring, flags: Execveat_Flags = {}) -> (Errno) {
 	ret := syscall(SYS_execveat, dirfd, cast(rawptr) name, cast(rawptr) argv, cast(rawptr) envp, transmute(i32) flags)
 	return Errno(-ret)
 }
diff --git a/core/sys/linux/types.odin b/core/sys/linux/types.odin
index e3fe67a9b..288edf879 100644
--- a/core/sys/linux/types.odin
+++ b/core/sys/linux/types.odin
@@ -688,7 +688,7 @@ Sock_Addr_In6 :: struct #packed {
 }
 
 /*
-  Struct representing Unix Domain Socket address
+	Struct representing Unix Domain Socket address
 */
 Sock_Addr_Un :: struct #packed {
 	sun_family: Address_Family,
@@ -1303,3 +1303,8 @@ EPoll_Event :: struct #packed {
 	events: EPoll_Event_Kind,
 	data:   EPoll_Data,
 }
+
+/*
+	Flags for execveat(2) syscall.
+*/
+Execveat_Flags :: bit_set[Execveat_Flags_Bits; i32]
diff --git a/core/sys/windows/ntdll.odin b/core/sys/windows/ntdll.odin
index 56c24f1a2..23444ff34 100644
--- a/core/sys/windows/ntdll.odin
+++ b/core/sys/windows/ntdll.odin
@@ -6,4 +6,254 @@ foreign import ntdll_lib "system:ntdll.lib"
 @(default_calling_convention="system")
 foreign ntdll_lib {
 	RtlGetVersion :: proc(lpVersionInformation: ^OSVERSIONINFOEXW) -> NTSTATUS ---
+
+
+	NtQueryInformationProcess :: proc(
+		ProcessHandle:            HANDLE,
+		ProcessInformationClass:  PROCESS_INFO_CLASS,
+		ProcessInformation:       rawptr,
+		ProcessInformationLength: u32,
+		ReturnLength:             ^u32,
+	) -> u32 ---
+
+	NtQueryInformationFile :: proc(
+		FileHandle:           HANDLE,
+		IoStatusBlock:        PIO_STATUS_BLOCK,
+		FileInformation:      rawptr,
+		Length:               ULONG,
+		FileInformationClass: FILE_INFORMATION_CLASS,
+	) -> NTSTATUS ---
+
+	NtQueryDirectoryFileEx :: proc(
+		FileHandle:           HANDLE,
+		Event:                HANDLE,
+		ApcRoutine:           PIO_APC_ROUTINE,
+		ApcContext:           PVOID,
+		IoStatusBlock:        PIO_STATUS_BLOCK,
+		FileInformation:      PVOID,
+		Length:               ULONG,
+		FileInformationClass: FILE_INFORMATION_CLASS,
+		QueryFlags:           ULONG,
+		FileName   :          PUNICODE_STRING,
+	) -> NTSTATUS ---
+}
+
+
+PIO_APC_ROUTINE :: #type proc "system" (ApcContext: rawptr, IoStatusBlock: PIO_STATUS_BLOCK, Reserved: ULONG)
+
+PIO_STATUS_BLOCK :: ^IO_STATUS_BLOCK
+IO_STATUS_BLOCK :: struct {
+	using _: struct #raw_union {
+		Status:  NTSTATUS,
+		Pointer: rawptr,
+	},
+	Information: ULONG_PTR,
+}
+
+
+PROCESS_INFO_CLASS :: enum c_int {
+	ProcessBasicInformation       = 0,
+	ProcessDebugPort              = 7,
+	ProcessWow64Information       = 26,
+	ProcessImageFileName          = 27,
+	ProcessBreakOnTermination     = 29,
+	ProcessTelemetryIdInformation = 64,
+	ProcessSubsystemInformation   = 75,
+}
+
+SL_RESTART_SCAN                :: 0x00000001 // The scan will start at the first entry in the directory. If this flag is not set, the scan will resume from where the last query ended.
+SL_RETURN_SINGLE_ENTRY         :: 0x00000002 // Normally the return buffer is packed with as many matching directory entries that fit. If this flag is set, the file system will return only one directory entry at a time. This does make the operation less efficient.
+SL_INDEX_SPECIFIED             :: 0x00000004 // The scan should start at a specified indexed position in the directory. This flag can only be set if you generate your own IRP_MJ_DIRECTORY_CONTROL IRP; the index is specified in the IRP. How the position is specified varies from file system to file system.
+SL_RETURN_ON_DISK_ENTRIES_ONLY :: 0x00000008 // Any file system filters that perform directory virtualization or just-in-time expansion should simply pass the request through to the file system and return entries that are currently on disk. Not all file systems support this flag.
+SL_NO_CURSOR_UPDATE_QUERY      :: 0x00000010 // File systems maintain per-FileObject directory cursor information. When multiple threads do queries using the same FileObject, access to the per-FileObject structure is single threaded to prevent corruption of the cursor state. This flag tells the file system to not update per-FileObject cursor state information thus allowing multiple threads to query in parallel using the same handle. It behaves as if SL_RESTART_SCAN is specified on each call. If a wild card pattern is given on the next call, the operation will not pick up where the last query ended. This allows for true asynchronous directory query support. If this flag is used inside a TxF transaction the operation will be failed. Not all file systems support this flag.
+
+
+PFILE_INFORMATION_CLASS :: ^FILE_INFORMATION_CLASS
+FILE_INFORMATION_CLASS :: enum c_int {
+	FileDirectoryInformation                     = 1,
+	FileFullDirectoryInformation                 = 2,
+	FileBothDirectoryInformation                 = 3,
+	FileBasicInformation                         = 4,
+	FileStandardInformation                      = 5,
+	FileInternalInformation                      = 6,
+	FileEaInformation                            = 7,
+	FileAccessInformation                        = 8,
+	FileNameInformation                          = 9,
+	FileRenameInformation                        = 10,
+	FileLinkInformation                          = 11,
+	FileNamesInformation                         = 12,
+	FileDispositionInformation                   = 13,
+	FilePositionInformation                      = 14,
+	FileFullEaInformation                        = 15,
+	FileModeInformation                          = 16,
+	FileAlignmentInformation                     = 17,
+	FileAllInformation                           = 18,
+	FileAllocationInformation                    = 19,
+	FileEndOfFileInformation                     = 20,
+	FileAlternateNameInformation                 = 21,
+	FileStreamInformation                        = 22,
+	FilePipeInformation                          = 23,
+	FilePipeLocalInformation                     = 24,
+	FilePipeRemoteInformation                    = 25,
+	FileMailslotQueryInformation                 = 26,
+	FileMailslotSetInformation                   = 27,
+	FileCompressionInformation                   = 28,
+	FileObjectIdInformation                      = 29,
+	FileCompletionInformation                    = 30,
+	FileMoveClusterInformation                   = 31,
+	FileQuotaInformation                         = 32,
+	FileReparsePointInformation                  = 33,
+	FileNetworkOpenInformation                   = 34,
+	FileAttributeTagInformation                  = 35,
+	FileTrackingInformation                      = 36,
+	FileIdBothDirectoryInformation               = 37,
+	FileIdFullDirectoryInformation               = 38,
+	FileValidDataLengthInformation               = 39,
+	FileShortNameInformation                     = 40,
+	FileIoCompletionNotificationInformation      = 41,
+	FileIoStatusBlockRangeInformation            = 42,
+	FileIoPriorityHintInformation                = 43,
+	FileSfioReserveInformation                   = 44,
+	FileSfioVolumeInformation                    = 45,
+	FileHardLinkInformation                      = 46,
+	FileProcessIdsUsingFileInformation           = 47,
+	FileNormalizedNameInformation                = 48,
+	FileNetworkPhysicalNameInformation           = 49,
+	FileIdGlobalTxDirectoryInformation           = 50,
+	FileIsRemoteDeviceInformation                = 51,
+	FileUnusedInformation                        = 52,
+	FileNumaNodeInformation                      = 53,
+	FileStandardLinkInformation                  = 54,
+	FileRemoteProtocolInformation                = 55,
+	FileRenameInformationBypassAccessCheck       = 56,
+	FileLinkInformationBypassAccessCheck         = 57,
+	FileVolumeNameInformation                    = 58,
+	FileIdInformation                            = 59,
+	FileIdExtdDirectoryInformation               = 60,
+	FileReplaceCompletionInformation             = 61,
+	FileHardLinkFullIdInformation                = 62,
+	FileIdExtdBothDirectoryInformation           = 63,
+	FileDispositionInformationEx                 = 64,
+	FileRenameInformationEx                      = 65,
+	FileRenameInformationExBypassAccessCheck     = 66,
+	FileDesiredStorageClassInformation           = 67,
+	FileStatInformation                          = 68,
+	FileMemoryPartitionInformation               = 69,
+	FileStatLxInformation                        = 70,
+	FileCaseSensitiveInformation                 = 71,
+	FileLinkInformationEx                        = 72,
+	FileLinkInformationExBypassAccessCheck       = 73,
+	FileStorageReserveIdInformation              = 74,
+	FileCaseSensitiveInformationForceAccessCheck = 75,
+	FileKnownFolderInformation                   = 76,
+	FileStatBasicInformation                     = 77,
+	FileId64ExtdDirectoryInformation             = 78,
+	FileId64ExtdBothDirectoryInformation         = 79,
+	FileIdAllExtdDirectoryInformation            = 80,
+	FileIdAllExtdBothDirectoryInformation        = 81,
+	FileStreamReservationInformation,
+	FileMupProviderInfo,
+	FileMaximumInformation,
+}
+
+PFILE_ID_FULL_DIR_INFORMATION :: ^FILE_ID_FULL_DIR_INFORMATION
+FILE_ID_FULL_DIR_INFORMATION :: struct {
+	NextEntryOffset: ULONG,
+	FileIndex:       ULONG,
+	CreationTime:    LARGE_INTEGER,
+	LastAccessTime:  LARGE_INTEGER,
+	LastWriteTime:   LARGE_INTEGER,
+	ChangeTime:      LARGE_INTEGER,
+	EndOfFile:       LARGE_INTEGER,
+	AllocationSize:  LARGE_INTEGER,
+	FileAttributes:  ULONG,
+	FileNameLength:  ULONG,
+	EaSize:          ULONG,
+	FileId:          LARGE_INTEGER,
+	FileName:        [1]WCHAR,
+}
+
+
+PROCESS_BASIC_INFORMATION :: struct {
+	ExitStatus:                   NTSTATUS,
+	PebBaseAddress:               ^PEB,
+	AffinityMask:                 ULONG_PTR,
+	BasePriority:                 KPRIORITY,
+	UniqueProcessId:              ULONG_PTR,
+	InheritedFromUniqueProcessId: ULONG_PTR,
+}
+
+KPRIORITY :: rawptr
+
+PPS_POST_PROCESS_INIT_ROUTINE :: proc "system" ()
+
+
+PEB :: struct {
+	_:                      [2]u8,
+	BeingDebugged:          u8,
+	_:                      [1]u8,
+	_:                      [2]rawptr,
+	Ldr:                    ^PEB_LDR_DATA,
+	ProcessParameters:      ^RTL_USER_PROCESS_PARAMETERS,
+	_:                      [104]u8,
+	_:                      [52]rawptr,
+	PostProcessInitRoutine: PPS_POST_PROCESS_INIT_ROUTINE,
+	_:                      [128]u8,
+	_:                      [1]rawptr,
+	SessionId:              u32,
+}
+
+
+
+
+PEB_LDR_DATA :: struct {
+	_: [8]u8,
+	_: [3]rawptr,
+	InMemoryOrderModuleList: LIST_ENTRY,
+}
+
+RTL_USER_PROCESS_PARAMETERS :: struct {
+	MaximumLength:          u32,
+	Length:                 u32,
+	Flags:                  u32,
+	DebugFlags:             u32,
+	ConsoleHandle:          rawptr,
+	ConsoleFlags:           u32,
+	StdInputHandle:         rawptr,
+	StdOutputHandle:        rawptr,
+	StdErrorHandle:         rawptr,
+	CurrentDirectoryPath:   UNICODE_STRING,
+	CurrentDirectoryHandle: rawptr,
+	DllPath:                UNICODE_STRING,
+	ImagePathName:          UNICODE_STRING,
+	CommandLine:            UNICODE_STRING,
+	Environment:            rawptr,
+	StartingPositionLeft:   u32,
+	StartingPositionTop:    u32,
+	Width:                  u32,
+	Height:                 u32,
+	CharWidth:              u32,
+	CharHeight:             u32,
+	ConsoleTextAttributes:  u32,
+	WindowFlags:            u32,
+	ShowWindowFlags:        u32,
+	WindowTitle:            UNICODE_STRING,
+	DesktopName:            UNICODE_STRING,
+	ShellInfo:              UNICODE_STRING,
+	RuntimeData:            UNICODE_STRING,
+	DLCurrentDirectory:     [32]RTL_DRIVE_LETTER_CURDIR,
+	EnvironmentSize:        u32,
+}
+
+RTL_DRIVE_LETTER_CURDIR :: struct {
+	Flags:     u16,
+	Length:    u16,
+	TimeStamp: u32,
+	DosPath:   UNICODE_STRING,
+}
+
+
+LIST_ENTRY :: struct {
+	Flink: ^LIST_ENTRY,
+	Blink: ^LIST_ENTRY,
 }
\ No newline at end of file
diff --git a/core/sys/windows/shell32.odin b/core/sys/windows/shell32.odin
index 3d464e847..7340ae4d4 100644
--- a/core/sys/windows/shell32.odin
+++ b/core/sys/windows/shell32.odin
@@ -32,6 +32,10 @@ foreign shell32 {
 	SHGetKnownFolderPath :: proc(rfid: REFKNOWNFOLDERID, dwFlags: /* KNOWN_FOLDER_FLAG */ DWORD, hToken: HANDLE, ppszPath: ^LPWSTR) -> HRESULT ---
 
 	ExtractIconExW :: proc(pszFile: LPCWSTR, nIconIndex: INT, phiconLarge: ^HICON, phiconSmall: ^HICON, nIcons: UINT) -> UINT ---
+	DragAcceptFiles :: proc(hWnd: HWND, fAccept: BOOL) ---
+	DragQueryPoint :: proc(hDrop: HDROP, ppt: ^POINT) -> BOOL ---
+	DragQueryFileW :: proc(hDrop: HDROP, iFile: UINT, lpszFile: LPWSTR, cch: UINT) -> UINT ---
+	DragFinish :: proc(hDrop: HDROP) --- // @New
 }
 
 APPBARDATA :: struct {
@@ -69,6 +73,8 @@ ABE_BOTTOM           :: 3
 KNOWNFOLDERID :: GUID
 REFKNOWNFOLDERID :: ^KNOWNFOLDERID
 
+HDROP :: HANDLE
+
 KNOWN_FOLDER_FLAG :: enum u32 {
 	DEFAULT                          = 0x00000000,
 
diff --git a/core/sys/windows/types.odin b/core/sys/windows/types.odin
index d63185c28..591041aed 100644
--- a/core/sys/windows/types.odin
+++ b/core/sys/windows/types.odin
@@ -1131,16 +1131,28 @@ TRACKMOUSEEVENT :: struct {
 }
 
 WIN32_FIND_DATAW :: struct {
-	dwFileAttributes: DWORD,
-	ftCreationTime: FILETIME,
-	ftLastAccessTime: FILETIME,
-	ftLastWriteTime: FILETIME,
-	nFileSizeHigh: DWORD,
-	nFileSizeLow: DWORD,
-	dwReserved0: DWORD,
-	dwReserved1: DWORD,
-	cFileName: [260]wchar_t, // #define MAX_PATH 260
-	cAlternateFileName: [14]wchar_t,
+	dwFileAttributes:   DWORD,
+	ftCreationTime:     FILETIME,
+	ftLastAccessTime:   FILETIME,
+	ftLastWriteTime:    FILETIME,
+	nFileSizeHigh:      DWORD,
+	nFileSizeLow:       DWORD,
+	dwReserved0:        DWORD,
+	dwReserved1:        DWORD,
+	cFileName:          [MAX_PATH]WCHAR,
+	cAlternateFileName: [14]WCHAR,
+	_OBSOLETE_dwFileType:    DWORD, // Obsolete. Do not use.
+	_OBSOLETE_dwCreatorType: DWORD, // Obsolete. Do not use
+	_OBSOLETE_wFinderFlags:  WORD,  // Obsolete. Do not use
+}
+
+FILE_ID_128 :: struct {
+	Identifier: [16]BYTE,
+}
+
+FILE_ID_INFO :: struct {
+	VolumeSerialNumber: ULONGLONG,
+	FileId:             FILE_ID_128,
 }
 
 CREATESTRUCTA :: struct {
@@ -1196,6 +1208,11 @@ NMHDR :: struct {
 	code:     UINT,      // NM_ code
 }
 
+NCCALCSIZE_PARAMS :: struct {
+	rgrc: [3]RECT,
+	lppos: PWINDOWPOS,
+}
+
 // Generic WM_NOTIFY notification codes
 NM_OUTOFMEMORY          :: ~uintptr(0) // -1
 NM_CLICK                :: NM_OUTOFMEMORY-1  // uses NMCLICK struct
@@ -2318,6 +2335,7 @@ FILE_TYPE_PIPE :: 0x0003
 RECT  :: struct {left, top, right, bottom: LONG}
 POINT :: struct {x, y: LONG}
 
+PWINDOWPOS :: ^WINDOWPOS
 WINDOWPOS :: struct {
 	hwnd: HWND,
 	hwndInsertAfter: HWND,
@@ -2549,6 +2567,7 @@ CLSCTX_RESERVED6                      :: 0x1000000
 CLSCTX_ACTIVATE_ARM32_SERVER          :: 0x2000000
 CLSCTX_ALLOW_LOWER_TRUST_REGISTRATION :: 0x4000000
 CLSCTX_PS_DLL                         :: 0x80000000
+CLSCTX_ALL                            :: CLSCTX_INPROC_SERVER | CLSCTX_INPROC_HANDLER | CLSCTX_LOCAL_SERVER | CLSCTX_REMOTE_SERVER
 
 WSAPROTOCOLCHAIN :: struct {
 	ChainLen: c_int,
@@ -2608,10 +2627,11 @@ OBJECT_ATTRIBUTES :: struct {
 	SecurityQualityOfService: rawptr,
 }
 
+PUNICODE_STRING :: ^UNICODE_STRING
 UNICODE_STRING :: struct {
-	Length:        u16,
-	MaximumLength: u16,
-	Buffer:        ^u16,
+	Length:        u16    `fmt:"-"`,
+	MaximumLength: u16    `fmt:"-"`,
+	Buffer:        [^]u16 `fmt:"s,Length"`,
 }
 
 OVERLAPPED :: struct {
@@ -2822,41 +2842,41 @@ NEON128 :: struct {
 
 EXCEPTION_POINTERS :: struct {
 	ExceptionRecord: ^EXCEPTION_RECORD,
-	ContextRecord: ^CONTEXT,
+	ContextRecord:   ^CONTEXT,
 }
 
 PVECTORED_EXCEPTION_HANDLER :: #type proc "system" (ExceptionInfo: ^EXCEPTION_POINTERS) -> LONG
 
 CONSOLE_READCONSOLE_CONTROL :: struct {
-	nLength: ULONG,
-	nInitialChars: ULONG,
-	dwCtrlWakeupMask: ULONG,
+	nLength:           ULONG,
+	nInitialChars:     ULONG,
+	dwCtrlWakeupMask:  ULONG,
 	dwControlKeyState: ULONG,
 }
 
 PCONSOLE_READCONSOLE_CONTROL :: ^CONSOLE_READCONSOLE_CONTROL
 
 BY_HANDLE_FILE_INFORMATION :: struct {
-	dwFileAttributes: DWORD,
-	ftCreationTime: FILETIME,
-	ftLastAccessTime: FILETIME,
-	ftLastWriteTime: FILETIME,
+	dwFileAttributes:     DWORD,
+	ftCreationTime:       FILETIME,
+	ftLastAccessTime:     FILETIME,
+	ftLastWriteTime:      FILETIME,
 	dwVolumeSerialNumber: DWORD,
-	nFileSizeHigh: DWORD,
-	nFileSizeLow: DWORD,
-	nNumberOfLinks: DWORD,
-	nFileIndexHigh: DWORD,
-	nFileIndexLow: DWORD,
+	nFileSizeHigh:        DWORD,
+	nFileSizeLow:         DWORD,
+	nNumberOfLinks:       DWORD,
+	nFileIndexHigh:       DWORD,
+	nFileIndexLow:        DWORD,
 }
 
 LPBY_HANDLE_FILE_INFORMATION :: ^BY_HANDLE_FILE_INFORMATION
 
 FILE_STANDARD_INFO :: struct {
 	AllocationSize: LARGE_INTEGER,
-	EndOfFile: LARGE_INTEGER,
-	NumberOfLinks: DWORD,
-	DeletePending: BOOLEAN,
-	Directory: BOOLEAN,
+	EndOfFile:      LARGE_INTEGER,
+	NumberOfLinks:  DWORD,
+	DeletePending:  BOOLEAN,
+	Directory:      BOOLEAN,
 }
 
 FILE_ATTRIBUTE_TAG_INFO :: struct {
diff --git a/core/testing/runner.odin b/core/testing/runner.odin
index fa7c2ffd2..da0328f91 100644
--- a/core/testing/runner.odin
+++ b/core/testing/runner.odin
@@ -6,6 +6,7 @@ import "base:runtime"
 import "core:bytes"
 import "core:encoding/ansi"
 @require import "core:encoding/base64"
+@require import "core:encoding/json"
 import "core:fmt"
 import "core:io"
 @require import pkg_log "core:log"
@@ -44,7 +45,8 @@ SHARED_RANDOM_SEED    : u64    : #config(ODIN_TEST_RANDOM_SEED, 0)
 LOG_LEVEL             : string : #config(ODIN_TEST_LOG_LEVEL, "info")
 // Show only the most necessary logging information.
 USING_SHORT_LOGS      : bool   : #config(ODIN_TEST_SHORT_LOGS, false)
-
+// Output a report of the tests to the given path.
+JSON_REPORT           : string : #config(ODIN_TEST_JSON_REPORT, "")
 
 get_log_level :: #force_inline proc() -> runtime.Logger_Level {
 	when ODIN_DEBUG {
@@ -61,6 +63,18 @@ get_log_level :: #force_inline proc() -> runtime.Logger_Level {
 	}
 }
 
+JSON :: struct {
+	total:    int,
+	success:  int,
+	duration: time.Duration,
+	packages: map[string][dynamic]JSON_Test,
+}
+
+JSON_Test :: struct {
+	success: bool,
+	name:    string,
+}
+
 end_t :: proc(t: ^T) {
 	for i := len(t.cleanups)-1; i >= 0; i -= 1 {
 		#no_bounds_check c := t.cleanups[i]
@@ -654,8 +668,8 @@ runner :: proc(internal_tests: []Internal_Test) -> bool {
 			#no_bounds_check pkg := report.packages_by_name[it.pkg]
 			pkg.frame_ready = false
 
-			fmt.assertf(thread.pool_stop_task(&pool, test_index),
-				"A signal (%v) was raised to stop test #%i %s.%s, but it was unable to be found.",
+			found := thread.pool_stop_task(&pool, test_index)
+			fmt.assertf(found, "A signal (%v) was raised to stop test #%i %s.%s, but it was unable to be found.",
 				reason, test_index, it.pkg, it.name)
 
 			// The order this is handled in is a little particular.
@@ -847,5 +861,35 @@ To partly mitigate this, redirect STDERR to a file or use the -define:ODIN_TEST_
 
 	fmt.wprintln(stderr, bytes.buffer_to_string(&batch_buffer))
 
+	when JSON_REPORT != "" {
+		json_report: JSON
+
+		mode: int
+		when ODIN_OS != .Windows {
+			mode = os.S_IRUSR|os.S_IWUSR|os.S_IRGRP|os.S_IROTH
+		}
+		json_fd, errno := os.open(JSON_REPORT, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, mode)
+		fmt.assertf(errno == os.ERROR_NONE, "unable to open file %q for writing of JSON report, error: %v", JSON_REPORT, errno)
+		defer os.close(json_fd)
+
+		for test, i in report.all_tests {
+			#no_bounds_check state := report.all_test_states[i]
+
+			if test.pkg not_in json_report.packages {
+				json_report.packages[test.pkg] = {}
+			}
+
+			tests := &json_report.packages[test.pkg]
+			append(tests, JSON_Test{name = test.name, success = state == .Successful})
+		}
+
+		json_report.total    = len(internal_tests)
+		json_report.success  = total_success_count
+		json_report.duration = finished_in
+
+		err := json.marshal_to_writer(os.stream_from_handle(json_fd), json_report, &{ pretty = true })
+		fmt.assertf(err == nil, "Error writing JSON report: %v", err)
+	}
+
 	return total_success_count == total_test_count
 }
diff --git a/core/thread/thread.odin b/core/thread/thread.odin
index 80e60d6cf..17ba1a0a2 100644
--- a/core/thread/thread.odin
+++ b/core/thread/thread.odin
@@ -6,12 +6,26 @@ import "base:intrinsics"
 
 _ :: intrinsics
 
+/*
+Value, specifying whether `core:thread` functionality is available on the
+current platform.
+*/
 IS_SUPPORTED :: _IS_SUPPORTED
 
+/*
+Type for a procedure that will be run in a thread, after that thread has been
+started.
+*/
 Thread_Proc :: #type proc(^Thread)
 
+/*
+Maximum number of user arguments for polymorphic thread procedures.
+*/
 MAX_USER_ARGUMENTS :: 8
 
+/*
+Type representing the state/flags of the thread.
+*/
 Thread_State :: enum u8 {
 	Started,
 	Joined,
@@ -19,44 +33,48 @@ Thread_State :: enum u8 {
 	Self_Cleanup,
 }
 
+/*
+Type representing a thread handle and the associated with that thread data.
+*/
 Thread :: struct {
 	using specific: Thread_Os_Specific,
 	flags: bit_set[Thread_State; u8],
-	id:             int,
-	procedure:      Thread_Proc,
-
-	/*
-		These are values that the user can set as they wish, after the thread has been created.
-		This data is easily available to the thread proc.
-
-		These fields can be assigned to directly.
-
-		Should be set after the thread is created, but before it is started.
-	*/
-	data:           rawptr,
-	user_index:     int,
-	user_args:      [MAX_USER_ARGUMENTS]rawptr,
-
-	/*
-		The context to be used as 'context' in the thread proc.
-
-		This field can be assigned to directly, after the thread has been created, but __before__ the thread has been started.
-		This field must not be changed after the thread has started.
-
-		NOTE: If you __don't__ set this, the temp allocator will be managed for you;
-		      If you __do__ set this, then you're expected to handle whatever allocators you set, yourself.
-
-		IMPORTANT:
-		By default, the thread proc will get the same context as `main()` gets.
-		In this situation, the thread will get a new temporary allocator which will be cleaned up when the thread dies.
-		***This does NOT happen when you set `init_context`.***
-		This means that if you set `init_context`, but still have the `temp_allocator` field set to the default temp allocator,
-		then you'll need to call `runtime.default_temp_allocator_destroy(auto_cast the_thread.init_context.temp_allocator.data)` manually,
-		in order to prevent any memory leaks.
-		This call ***must*** be done ***in the thread proc*** because the default temporary allocator uses thread local state!
-	*/
+	// Thread ID.
+	id: int,
+	// The thread procedure.
+	procedure: Thread_Proc,
+	// User-supplied pointer, that will be available to the thread once it is
+	// started. Should be set after the thread has been created, but before
+	// it is started.
+	data: rawptr,
+	// User-supplied integer, that will be available to the thread once it is
+	// started. Should be set after the thread has been created, but before
+	// it is started.
+	user_index: int,
+	// User-supplied array of arguments, that will be available to the thread,
+	// once it is started. Should be set after the thread has been created,
+	// but before it is started.
+	user_args: [MAX_USER_ARGUMENTS]rawptr,
+	// The thread context.
+	// This field can be assigned to directly, after the thread has been
+	// created, but __before__ the thread has been started. This field must
+	// not be changed after the thread has started.
+	//
+	// **Note**: If this field is **not** set, the temp allocator will be managed
+	// automatically. If it is set, the allocators must be handled manually.
+	//
+	// **IMPORTANT**:
+	// By default, the thread proc will get the same context as `main()` gets.
+	// In this situation, the thread will get a new temporary allocator which
+	// will be cleaned up when the thread dies. ***This does NOT happen when
+	// `init_context` field is initialized***.
+	//
+	// If `init_context` is initialized, and `temp_allocator` field is set to
+	// the default temp allocator, then `runtime.default_temp_allocator_destroy()`
+	// procedure needs to be called from the thread procedure, in order to prevent
+	// any memory leaks.
 	init_context: Maybe(runtime.Context),
-
+	// The allocator used to allocate data for the thread.
 	creation_allocator: mem.Allocator,
 }
 
@@ -64,6 +82,9 @@ when IS_SUPPORTED {
 	#assert(size_of(Thread{}.user_index) == size_of(uintptr))
 }
 
+/*
+Type representing priority of a thread.
+*/
 Thread_Priority :: enum {
 	Normal,
 	Low,
@@ -71,74 +92,178 @@ Thread_Priority :: enum {
 }
 
 /*
-	Creates a thread in a suspended state with the given priority.
-	To start the thread, call `thread.start()`.
+Create a thread in a suspended state with the given priority.
 
-	See `thread.create_and_start()`.
+This procedure creates a thread that will be set to run the procedure
+specified by `procedure` parameter with a specified priority. The returned
+thread will be in a suspended state, until `start()` procedure is called.
+
+To start the thread, call `start()`. Also the `create_and_start()`
+procedure can be called to create and start the thread immediately.
 */
 create :: proc(procedure: Thread_Proc, priority := Thread_Priority.Normal) -> ^Thread {
 	return _create(procedure, priority)
 }
+
+/*
+Wait for the thread to finish and free all data associated with it.
+*/
 destroy :: proc(thread: ^Thread) {
 	_destroy(thread)
 }
 
+/*
+Start a suspended thread.
+*/
 start :: proc(thread: ^Thread) {
 	_start(thread)
 }
 
+/*
+Check if the thread has finished work.
+*/
 is_done :: proc(thread: ^Thread) -> bool {
 	return _is_done(thread)
 }
 
-
+/*
+Wait for the thread to finish work.
+*/
 join :: proc(thread: ^Thread) {
 	_join(thread)
 }
 
-
+/*
+Wait for all threads to finish work.
+*/
 join_multiple :: proc(threads: ..^Thread) {
 	_join_multiple(..threads)
 }
 
+/*
+Forcibly terminate a running thread.
+*/
 terminate :: proc(thread: ^Thread, exit_code: int) {
 	_terminate(thread, exit_code)
 }
 
+/*
+Yield the execution of the current thread to another OS thread or process.
+*/
 yield :: proc() {
 	_yield()
 }
 
+/*
+Run a procedure on a different thread.
 
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
 
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run :: proc(fn: proc(), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal) {
 	create_and_start(fn, init_context, priority, true)
 }
 
+/*
+Run a procedure with one pointer parameter on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run_with_data :: proc(data: rawptr, fn: proc(data: rawptr), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal) {
 	create_and_start_with_data(data, fn, init_context, priority, true)
 }
 
+/*
+Run a procedure with one polymorphic parameter on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run_with_poly_data :: proc(data: $T, fn: proc(data: T), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal)
 	where size_of(T) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	create_and_start_with_poly_data(data, fn, init_context, priority, true)
 }
 
+/*
+Run a procedure with two polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run_with_poly_data2 :: proc(arg1: $T1, arg2: $T2, fn: proc(T1, T2), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal)
 	where size_of(T1) + size_of(T2) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	create_and_start_with_poly_data2(arg1, arg2, fn, init_context, priority, true)
 }
 
+/*
+Run a procedure with three polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run_with_poly_data3 :: proc(arg1: $T1, arg2: $T2, arg3: $T3, fn: proc(arg1: T1, arg2: T2, arg3: T3), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal)
 	where size_of(T1) + size_of(T2) + size_of(T3) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	create_and_start_with_poly_data3(arg1, arg2, arg3, fn, init_context, priority, true)
 }
+
+/*
+Run a procedure with four polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 run_with_poly_data4 :: proc(arg1: $T1, arg2: $T2, arg3: $T3, arg4: $T4, fn: proc(arg1: T1, arg2: T2, arg3: T3, arg4: T4), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal)
 	where size_of(T1) + size_of(T2) + size_of(T3) + size_of(T4) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	create_and_start_with_poly_data4(arg1, arg2, arg3, arg4, fn, init_context, priority, true)
 }
 
+/*
+Run a procedure on a different thread.
 
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start :: proc(fn: proc(), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread {
 	thread_proc :: proc(t: ^Thread) {
 		fn := cast(proc())t.data
@@ -154,9 +279,22 @@ create_and_start :: proc(fn: proc(), init_context: Maybe(runtime.Context) = nil,
 	return t
 }
 
+/*
+Run a procedure with one pointer parameter on a different thread.
 
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
 
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
 
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start_with_data :: proc(data: rawptr, fn: proc(data: rawptr), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread {
 	thread_proc :: proc(t: ^Thread) {
 		fn := cast(proc(rawptr))t.data
@@ -176,6 +314,22 @@ create_and_start_with_data :: proc(data: rawptr, fn: proc(data: rawptr), init_co
 	return t
 }
 
+/*
+Run a procedure with one polymorphic parameter on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start_with_poly_data :: proc(data: $T, fn: proc(data: T), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread
 	where size_of(T) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	thread_proc :: proc(t: ^Thread) {
@@ -201,6 +355,22 @@ create_and_start_with_poly_data :: proc(data: $T, fn: proc(data: T), init_contex
 	return t
 }
 
+/*
+Run a procedure with two polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start_with_poly_data2 :: proc(arg1: $T1, arg2: $T2, fn: proc(T1, T2), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread
 	where size_of(T1) + size_of(T2) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	thread_proc :: proc(t: ^Thread) {
@@ -232,6 +402,22 @@ create_and_start_with_poly_data2 :: proc(arg1: $T1, arg2: $T2, fn: proc(T1, T2),
 	return t
 }
 
+/*
+Run a procedure with three polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start_with_poly_data3 :: proc(arg1: $T1, arg2: $T2, arg3: $T3, fn: proc(arg1: T1, arg2: T2, arg3: T3), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread
 	where size_of(T1) + size_of(T2) + size_of(T3) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	thread_proc :: proc(t: ^Thread) {
@@ -264,6 +450,23 @@ create_and_start_with_poly_data3 :: proc(arg1: $T1, arg2: $T2, arg3: $T3, fn: pr
 	start(t)
 	return t
 }
+
+/*
+Run a procedure with four polymorphic parameters on a different thread.
+
+This procedure runs the given procedure on another thread. The context
+specified by `init_context` will be used as the context in which `fn` is going
+to execute. The thread will have priority specified by the `priority` parameter.
+
+If `self_cleanup` is specified, after the thread finishes the execution of the
+`fn` procedure, the resources associated with the thread are going to be
+automatically freed. **Do not** dereference the `^Thread` pointer, if this
+flag is specified.
+
+**IMPORTANT**: If `init_context` is specified and the default temporary allocator
+is used, the thread procedure needs to call `runtime.default_temp_allocator_destroy()`
+in order to free the resources associated with the temporary allocations.
+*/
 create_and_start_with_poly_data4 :: proc(arg1: $T1, arg2: $T2, arg3: $T3, arg4: $T4, fn: proc(arg1: T1, arg2: T2, arg3: T3, arg4: T4), init_context: Maybe(runtime.Context) = nil, priority := Thread_Priority.Normal, self_cleanup := false) -> ^Thread
 	where size_of(T1) + size_of(T2) + size_of(T3) + size_of(T4) <= size_of(rawptr) * MAX_USER_ARGUMENTS {
 	thread_proc :: proc(t: ^Thread) {
diff --git a/core/thread/thread_unix.odin b/core/thread/thread_unix.odin
index 363f50862..f56454bfc 100644
--- a/core/thread/thread_unix.odin
+++ b/core/thread/thread_unix.odin
@@ -81,9 +81,12 @@ _create :: proc(procedure: Thread_Proc, priority: Thread_Priority) -> ^Thread {
 	defer unix.pthread_attr_destroy(&attrs)
 
 	// NOTE(tetra, 2019-11-01): These only fail if their argument is invalid.
-	assert(unix.pthread_attr_setdetachstate(&attrs, unix.PTHREAD_CREATE_JOINABLE) == 0)
+	res: i32
+	res = unix.pthread_attr_setdetachstate(&attrs, unix.PTHREAD_CREATE_JOINABLE)
+	assert(res == 0)
 	when ODIN_OS != .Haiku && ODIN_OS != .NetBSD {
-		assert(unix.pthread_attr_setinheritsched(&attrs, unix.PTHREAD_EXPLICIT_SCHED) == 0)
+		res = unix.pthread_attr_setinheritsched(&attrs, unix.PTHREAD_EXPLICIT_SCHED)
+		assert(res == 0)
 	}
 
 	thread := new(Thread)
@@ -94,7 +97,6 @@ _create :: proc(procedure: Thread_Proc, priority: Thread_Priority) -> ^Thread {
 
 	// Set thread priority.
 	policy: i32
-	res: i32
 	when ODIN_OS != .Haiku && ODIN_OS != .NetBSD {
 		res = unix.pthread_attr_getschedpolicy(&attrs, &policy)
 		assert(res == 0)
diff --git a/core/time/datetime/constants.odin b/core/time/datetime/constants.odin
index a2a02838c..5f336ef4a 100644
--- a/core/time/datetime/constants.odin
+++ b/core/time/datetime/constants.odin
@@ -1,16 +1,46 @@
 package datetime
 
-// Ordinal 1 = Midnight Monday, January 1, 1 A.D. (Gregorian)
-//         |   Midnight Monday, January 3, 1 A.D. (Julian)
+/*
+Type representing a mononotic day number corresponding to a date.
+
+	Ordinal 1 = Midnight Monday, January 1, 1 A.D. (Gregorian)
+	        |   Midnight Monday, January 3, 1 A.D. (Julian)
+*/
 Ordinal :: i64
+
+/*
+*/
 EPOCH   :: Ordinal(1)
 
-// Minimum and maximum dates and ordinals. Chosen for safe roundtripping.
+/*
+Minimum valid value for date.
+
+The value is chosen such that a conversion `date -> ordinal -> date` is always
+safe.
+*/
 MIN_DATE :: Date{year = -25_252_734_927_766_552, month =  1, day =  1}
+
+/*
+Maximum valid value for date
+
+The value is chosen such that a conversion `date -> ordinal -> date` is always
+safe.
+*/
 MAX_DATE :: Date{year =  25_252_734_927_766_552, month = 12, day = 31}
+
+/*
+Minimum value for an ordinal
+*/
 MIN_ORD  :: Ordinal(-9_223_372_036_854_775_234)
+
+/*
+Maximum value for an ordinal
+*/
 MAX_ORD  :: Ordinal( 9_223_372_036_854_774_869)
 
+/*
+Possible errors returned by datetime functions.
+*/
 Error :: enum {
 	None,
 	Invalid_Year,
@@ -24,12 +54,22 @@ Error :: enum {
 	Invalid_Delta,
 }
 
+/*
+A type representing a date.
+
+The minimum and maximum values for a year can be found in `MIN_DATE` and
+`MAX_DATE` constants. The `month` field can range from 1 to 12, and the day
+ranges from 1 to however many days there are in the specified month.
+*/
 Date :: struct {
 	year:   i64,
 	month:  i8,
 	day:    i8,
 }
 
+/*
+A type representing a time within a single day within a nanosecond precision.
+*/
 Time :: struct {
 	hour:   i8,
 	minute: i8,
@@ -37,17 +77,30 @@ Time :: struct {
 	nano:   i32,
 }
 
+/*
+A type representing datetime.
+*/
 DateTime :: struct {
 	using date: Date,
 	using time: Time,
 }
 
+/*
+A type representing a difference between two instances of datetime.
+
+**Note**: All fields are i64 because we can also use it to add a number of
+seconds or nanos to a moment, that are then normalized within their respective
+ranges.
+*/
 Delta :: struct {
-	days:    i64, // These are all i64 because we can also use it to add a number of seconds or nanos to a moment,
-	seconds: i64, // that are then normalized within their respective ranges.
+	days:    i64, 
+	seconds: i64, 
 	nanos:   i64,
 }
 
+/*
+Type representing one of the months.
+*/
 Month :: enum i8 {
 	January = 1,
 	February,
@@ -63,6 +116,9 @@ Month :: enum i8 {
 	December,
 }
 
+/*
+Type representing one of the weekdays.
+*/
 Weekday :: enum i8 {
 	Sunday = 0,
 	Monday,
diff --git a/core/time/datetime/datetime.odin b/core/time/datetime/datetime.odin
index 938b4a368..fc9780e3b 100644
--- a/core/time/datetime/datetime.odin
+++ b/core/time/datetime/datetime.odin
@@ -1,56 +1,113 @@
 /*
-	Calendrical conversions using a proleptic Gregorian calendar.
+Calendrical conversions using a proleptic Gregorian calendar.
 
-	Implemented using formulas from: Calendrical Calculations Ultimate Edition, Reingold & Dershowitz
+Implemented using formulas from: Calendrical Calculations Ultimate Edition,
+Reingold & Dershowitz
 */
 package datetime
 
 import "base:intrinsics"
 
-// Procedures that return an Ordinal
+/*
+Obtain an ordinal from a date.
 
+This procedure converts the specified date into an ordinal. If the specified
+date is not a valid date, an error is returned.
+*/
 date_to_ordinal :: proc "contextless" (date: Date) -> (ordinal: Ordinal, err: Error) {
 	validate(date) or_return
 	return unsafe_date_to_ordinal(date), .None
 }
 
+/*
+Obtain an ordinal from date components.
+
+This procedure converts the specified date, provided by its individual
+components, into an ordinal. If the specified date is not a valid date, an error
+is returned.
+*/
 components_to_ordinal :: proc "contextless" (#any_int year, #any_int month, #any_int day: i64) -> (ordinal: Ordinal, err: Error) {
 	validate(year, month, day) or_return
 	return unsafe_date_to_ordinal({year, i8(month), i8(day)}), .None
 }
 
-// Procedures that return a Date
+/*
+Obtain date using an Ordinal.
 
+This provedure converts the specified ordinal into a date. If the ordinal is not
+a valid ordinal, an error is returned.
+*/
 ordinal_to_date :: proc "contextless" (ordinal: Ordinal) -> (date: Date, err: Error) {
 	validate(ordinal) or_return
 	return unsafe_ordinal_to_date(ordinal), .None
 }
 
+/*
+Obtain a date from date components.
+
+This procedure converts date components, specified by a year, a month and a day,
+into a date object. If the provided date components don't represent a valid
+date, an error is returned.
+*/
 components_to_date :: proc "contextless" (#any_int year, #any_int month, #any_int day: i64) -> (date: Date, err: Error) {
 	validate(year, month, day) or_return
 	return Date{i64(year), i8(month), i8(day)}, .None
 }
 
+/*
+Obtain time from time components.
+
+This procedure converts time components, specified by an hour, a minute, a second
+and nanoseconds, into a time object. If the provided time components don't
+represent a valid time, an error is returned.
+*/
 components_to_time :: proc "contextless" (#any_int hour, #any_int minute, #any_int second: i64, #any_int nanos := i64(0)) -> (time: Time, err: Error) {
 	validate(hour, minute, second, nanos) or_return
 	return Time{i8(hour), i8(minute), i8(second), i32(nanos)}, .None
 }
 
+/*
+Obtain datetime from components.
+
+This procedure converts date components and time components into a datetime object.
+If the provided date components or time components don't represent a valid
+datetime, an error is returned.
+*/
 components_to_datetime :: proc "contextless" (#any_int year, #any_int month, #any_int day, #any_int hour, #any_int minute, #any_int second: i64, #any_int nanos := i64(0)) -> (datetime: DateTime, err: Error) {
 	date := components_to_date(year, month, day)            or_return
 	time := components_to_time(hour, minute, second, nanos) or_return
 	return {date, time}, .None
 }
 
+/*
+Obtain an datetime from an ordinal.
+
+This procedure converts the value of an ordinal into a datetime. Since the
+ordinal only has the amount of days, the resulting time in the datetime
+object will always have the time equal to `00:00:00.000`.
+*/
 ordinal_to_datetime :: proc "contextless" (ordinal: Ordinal) -> (datetime: DateTime, err: Error) {
 	d := ordinal_to_date(ordinal) or_return
 	return {Date(d), {}}, .None
 }
 
+/*
+Calculate the weekday from an ordinal.
+
+This procedure takes the value of an ordinal and returns the day of week for
+that ordinal.
+*/
 day_of_week :: proc "contextless" (ordinal: Ordinal) -> (day: Weekday) {
 	return Weekday((ordinal - EPOCH + 1) %% 7)
 }
 
+/*
+Calculate the difference between two dates.
+
+This procedure calculates the difference between two dates `a - b`, and returns
+a delta between the two dates in `days`. If either `a` or `b` is not a valid
+date, an error is returned.
+*/
 subtract_dates :: proc "contextless" (a, b: Date) -> (delta: Delta, err: Error) {
 	ord_a := date_to_ordinal(a) or_return
 	ord_b := date_to_ordinal(b) or_return
@@ -59,6 +116,16 @@ subtract_dates :: proc "contextless" (a, b: Date) -> (delta: Delta, err: Error)
 	return
 }
 
+/*
+Calculate the difference between two datetimes.
+
+This procedure calculates the difference between two datetimes, `a - b`, and
+returns a delta between the two dates. The difference is returned in all three
+fields of the `Delta` struct: the difference in days, the difference in seconds
+and the difference in nanoseconds.
+
+If either `a` or `b` is not a valid datetime, an error is returned.
+*/
 subtract_datetimes :: proc "contextless" (a, b: DateTime) -> (delta: Delta, err: Error) {
 	ord_a := date_to_ordinal(a) or_return
 	ord_b := date_to_ordinal(b) or_return
@@ -73,19 +140,42 @@ subtract_datetimes :: proc "contextless" (a, b: DateTime) -> (delta: Delta, err:
 	return
 }
 
+/*
+Calculate a difference between two deltas.
+*/
 subtract_deltas :: proc "contextless" (a, b: Delta) -> (delta: Delta, err: Error) {
 	delta = Delta{a.days - b.days, a.seconds - b.seconds, a.nanos - b.nanos}
 	delta = normalize_delta(delta) or_return
 	return
 }
+
+/*
+Calculate a difference between two datetimes, dates or deltas.
+*/
 sub :: proc{subtract_datetimes, subtract_dates, subtract_deltas}
 
+/*
+Add certain amount of days to a date.
+
+This procedure adds the specified amount of days to a date and returns a new
+date. The new date would have happened the specified amount of days after the
+specified date.
+*/
 add_days_to_date :: proc "contextless" (a: Date, days: i64) -> (date: Date, err: Error) {
 	ord := date_to_ordinal(a) or_return
 	ord += days
 	return ordinal_to_date(ord)
 }
 
+/*
+Add delta to a date.
+
+This procedure adds a delta to a date, and returns a new date. The new date
+would have happened the time specified by `delta` after the specified date.
+
+**Note**: The delta is assumed to be normalized. That is, if it contains seconds
+or milliseconds, regardless of the amount only the days will be added.
+*/
 add_delta_to_date :: proc "contextless" (a: Date, delta: Delta) -> (date: Date, err: Error) {
 	ord := date_to_ordinal(a) or_return
 	// Because the input is a Date, we add only the days from the Delta.
@@ -93,6 +183,13 @@ add_delta_to_date :: proc "contextless" (a: Date, delta: Delta) -> (date: Date,
 	return ordinal_to_date(ord)
 }
 
+/*
+Add delta to datetime.
+
+This procedure adds a delta to a datetime, and returns a new datetime. The new
+datetime would have happened the time specified by `delta` after the specified
+datetime. 
+*/
 add_delta_to_datetime :: proc "contextless" (a: DateTime, delta: Delta) -> (datetime: DateTime, err: Error) {
 	days   := date_to_ordinal(a) or_return
 
@@ -110,8 +207,18 @@ add_delta_to_datetime :: proc "contextless" (a: DateTime, delta: Delta) -> (date
 	datetime.time = components_to_time(hour, minute, second, sum_delta.nanos) or_return
 	return
 }
+
+/*
+Add days to a date, delta to a date or delta to datetime.
+*/
 add :: proc{add_days_to_date, add_delta_to_date, add_delta_to_datetime}
 
+/*
+Obtain the day number in a year
+
+This procedure returns the number of the day in a year, starting from 1. If
+the date is not a valid date, an error is returned.
+*/
 day_number :: proc "contextless" (date: Date) -> (day_number: i64, err: Error) {
 	validate(date) or_return
 
@@ -120,6 +227,13 @@ day_number :: proc "contextless" (date: Date) -> (day_number: i64, err: Error) {
 	return
 }
 
+/*
+Obtain the remaining number of days in a year.
+
+This procedure returns the number of days between the specified date and
+December 31 of the same year. If the date is not a valid date, an error is
+returned.
+*/
 days_remaining :: proc "contextless" (date: Date) -> (days_remaining: i64, err: Error) {
 	// Alternative formulation `day_number` subtracted from 365 or 366 depending on leap year
 	validate(date) or_return
@@ -127,6 +241,12 @@ days_remaining :: proc "contextless" (date: Date) -> (days_remaining: i64, err:
 	return delta.days, .None
 }
 
+/*
+Obtain the last day of a given month on a given year.
+
+This procedure returns the amount of days in a specified month on a specified
+date. If the specified year or month is not valid, an error is returned.
+*/
 last_day_of_month :: proc "contextless" (#any_int year: i64, #any_int month: i8) -> (day: i8, err: Error) {
 	// Not using formula 2.27 from the book. This is far simpler and gives the same answer.
 
@@ -140,16 +260,33 @@ last_day_of_month :: proc "contextless" (#any_int year: i64, #any_int month: i8)
 	return
 }
 
+/*
+Obtain the new year date of a given year.
+
+This procedure returns the January 1st date of the specified year. If the year
+is not valid, an error is returned.
+*/
 new_year :: proc "contextless" (#any_int year: i64) -> (new_year: Date, err: Error) {
 	validate(year, 1, 1) or_return
 	return {year, 1, 1}, .None
 }
 
+/*
+Obtain the end year of a given date.
+
+This procedure returns the December 31st date of the specified year. If the year
+is not valid, an error is returned.
+*/
 year_end :: proc "contextless" (#any_int year: i64) -> (year_end: Date, err: Error) {
 	validate(year, 12, 31) or_return
 	return {year, 12, 31}, .None
 }
 
+/*
+Obtain the range of dates for a given year.
+
+This procedure returns dates, for every day of a given year in a slice.
+*/
 year_range :: proc (#any_int year: i64, allocator := context.allocator) -> (range: []Date) {
 	is_leap := is_leap_year(year)
 
@@ -171,6 +308,15 @@ year_range :: proc (#any_int year: i64, allocator := context.allocator) -> (rang
 	return
 }
 
+/*
+Normalize the delta.
+
+This procedure normalizes the delta in such a way that the number of seconds
+is between 0 and the number of seconds in the day and nanoseconds is between
+0 and 10^9.
+
+If the value for `days` overflows during this operation, an error is returned.
+*/
 normalize_delta :: proc "contextless" (delta: Delta) -> (normalized: Delta, err: Error) {
 	// Distribute nanos into seconds and remainder
 	seconds, nanos := divmod(delta.nanos, 1e9)
@@ -194,6 +340,12 @@ normalize_delta :: proc "contextless" (delta: Delta) -> (normalized: Delta, err:
 // The following procedures don't check whether their inputs are in a valid range.
 // They're still exported for those who know their inputs have been validated.
 
+/*
+Obtain an ordinal from a date.
+
+This procedure converts a date into an ordinal. If the date is not a valid date,
+the result is unspecified.
+*/
 unsafe_date_to_ordinal :: proc "contextless" (date: Date) -> (ordinal: Ordinal) {
 	year_minus_one := date.year - 1
 
@@ -223,6 +375,12 @@ unsafe_date_to_ordinal :: proc "contextless" (date: Date) -> (ordinal: Ordinal)
 	return
 }
 
+/*
+Obtain a year and a day of the year from an ordinal.
+
+This procedure returns the year and the day of the year of a given ordinal.
+Of the ordinal is outside of its valid range, the result is unspecified.
+*/
 unsafe_ordinal_to_year :: proc "contextless" (ordinal: Ordinal) -> (year: i64, day_ordinal: i64) {
 	// Days after epoch
 	d0   := ordinal - EPOCH
@@ -253,6 +411,12 @@ unsafe_ordinal_to_year :: proc "contextless" (ordinal: Ordinal) -> (year: i64, d
 	return year + 1, day_ordinal
 }
 
+/*
+Obtain a date from an ordinal.
+
+This procedure converts an ordinal into a date. If the ordinal is outside of
+its valid range, the result is unspecified.
+*/
 unsafe_ordinal_to_date :: proc "contextless" (ordinal: Ordinal) -> (date: Date) {
 	year, _ := unsafe_ordinal_to_year(ordinal)
 
diff --git a/core/time/datetime/internal.odin b/core/time/datetime/internal.odin
index 45c2b99ab..e7129548e 100644
--- a/core/time/datetime/internal.odin
+++ b/core/time/datetime/internal.odin
@@ -1,3 +1,4 @@
+//+private
 package datetime
 
 // Internal helper functions for calendrical conversions
diff --git a/core/time/datetime/validation.odin b/core/time/datetime/validation.odin
index 87d5aa1cd..0a66833b0 100644
--- a/core/time/datetime/validation.odin
+++ b/core/time/datetime/validation.odin
@@ -1,14 +1,29 @@
 package datetime
-
 // Validation helpers
+
+/*
+Check if a year is a leap year.
+*/
 is_leap_year :: proc "contextless" (#any_int year: i64) -> (leap: bool) {
 	return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
 }
 
+/*
+Check for errors in date formation.
+
+This procedure validates all fields of a date, and if any of the fields is
+outside of allowed range, an error is returned.
+*/
 validate_date :: proc "contextless" (date: Date) -> (err: Error) {
 	return validate(date.year, date.month, date.day)
 }
 
+/*
+Check for errors in date formation given date components.
+
+This procedure checks whether a date formed by the specified year month and a
+day is a valid date. If not, an error is returned.
+*/
 validate_year_month_day :: proc "contextless" (#any_int year, #any_int month, #any_int day: i64) -> (err: Error) {
 	if year < MIN_DATE.year || year > MAX_DATE.year {
 		return .Invalid_Year
@@ -29,6 +44,12 @@ validate_year_month_day :: proc "contextless" (#any_int year, #any_int month, #a
 	return .None
 }
 
+/*
+Check for errors in Ordinal
+
+This procedure checks if the ordinal is in a valid range for roundtrip
+conversions with the dates. If not, an error is returned.
+*/
 validate_ordinal :: proc "contextless" (ordinal: Ordinal) -> (err: Error) {
 	if ordinal < MIN_ORD || ordinal > MAX_ORD {
 		return .Invalid_Ordinal
@@ -36,10 +57,22 @@ validate_ordinal :: proc "contextless" (ordinal: Ordinal) -> (err: Error) {
 	return
 }
 
+/*
+Check for errors in time formation
+
+This procedure checks whether time has all fields in valid ranges, and if not
+an error is returned.
+*/
 validate_time :: proc "contextless" (time: Time) -> (err: Error) {
 	return validate(time.hour, time.minute, time.second, time.nano)
 }
 
+/*
+Check for errors in time formed by its components.
+
+This procedure checks whether the time formed by its components is valid, and
+if not an error is returned.
+*/
 validate_hour_minute_second :: proc "contextless" (#any_int hour, #any_int minute, #any_int second, #any_int nano: i64) -> (err: Error) {
 	if hour < 0 || hour > 23 {
 		return .Invalid_Hour
@@ -56,12 +89,21 @@ validate_hour_minute_second :: proc "contextless" (#any_int hour, #any_int minut
 	return .None
 }
 
+/*
+Check for errors in datetime formation.
+
+This procedure checks whether all fields of date and time in the specified
+datetime are valid, and if not, an error is returned.
+*/
 validate_datetime :: proc "contextless" (datetime: DateTime) -> (err: Error) {
 	validate(datetime.date) or_return
 	validate(datetime.time) or_return
 	return .None
 }
 
+/*
+Check for errors in date, time or datetime.
+*/
 validate :: proc{
 	validate_date,
 	validate_year_month_day,
diff --git a/core/time/iso8601.odin b/core/time/iso8601.odin
index 528e0b00a..f00107226 100644
--- a/core/time/iso8601.odin
+++ b/core/time/iso8601.odin
@@ -3,23 +3,62 @@ package time
 
 import dt "core:time/datetime"
 
-// Parses an ISO 8601 string and returns Time in UTC, with any UTC offset applied to it.
-// Only 4-digit years are accepted.
-// Optional pointer to boolean `is_leap` will return `true` if the moment was a leap second.
-// Leap seconds are smeared into 23:59:59.
+/*
+Parse an ISO 8601 string into a time with UTC offset applied to it.
+
+This procedure parses an ISO 8601 string of roughly the following format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns time, in UTC represented by that string. In case the timezone offset
+is specified in the string, that timezone is applied to time.
+
+**Inputs**:
+- `iso_datetime`: The string to be parsed.
+- `is_leap`: Optional output parameter, specifying if the moment was a leap second.
+
+**Returns**:
+- `res`: The time represented by `iso_datetime`, with UTC offset applied.
+- `consumed`: Number of bytes consumed by parsing the string.
+
+**Notes**:
+- Only 4-digit years are accepted.
+- Leap seconds are smeared into 23:59:59.
+*/
 iso8601_to_time_utc :: proc(iso_datetime: string, is_leap: ^bool = nil) -> (res: Time, consumed: int) {
 	offset: int
-
 	res, offset, consumed = iso8601_to_time_and_offset(iso_datetime, is_leap)
 	res._nsec += (i64(-offset) * i64(Minute))
 	return res, consumed
 }
 
-// Parses an ISO 8601 string and returns Time and a UTC offset in minutes.
-// e.g. 1985-04-12T23:20:50.52Z
-// Note: Only 4-digit years are accepted.
-// Optional pointer to boolean `is_leap` will return `true` if the moment was a leap second.
-// Leap seconds are smeared into 23:59:59.
+/*
+Parse an ISO 8601 string into a time and a UTC offset in minutes.
+
+This procedure parses an ISO 8601 string of roughly the following format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns time, in UTC represented by that string, and the UTC offset, in
+minutes.
+
+**Inputs**:
+- `iso_datetime`: The string to be parsed.
+- `is_leap`: Optional output parameter, specifying if the moment was a leap second.
+
+**Returns**:
+- `res`: The time in UTC.
+- `utc_offset`: The UTC offset of the time, in minutes.
+- `consumed`: Number of bytes consumed by parsing the string.
+
+**Notes**:
+- Only 4-digit years are accepted.
+- Leap seconds are smeared into 23:59:59.
+*/
 iso8601_to_time_and_offset :: proc(iso_datetime: string, is_leap: ^bool = nil) -> (res: Time, utc_offset: int, consumed: int) {
 	moment, offset, leap_second, count := iso8601_to_components(iso_datetime)
 	if count == 0 {
@@ -37,9 +76,32 @@ iso8601_to_time_and_offset :: proc(iso_datetime: string, is_leap: ^bool = nil) -
 	}
 }
 
-// Parses an ISO 8601 string and returns Time and a UTC offset in minutes.
-// e.g. 1985-04-12T23:20:50.52Z
-// Performs no validation on whether components are valid, e.g. it'll return hour = 25 if that's what it's given
+/*
+Parse an ISO 8601 string into a datetime and a UTC offset in minutes.
+
+This procedure parses an ISO 8601 string of roughly the following format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns datetime, in UTC represented by that string, and the UTC offset, in
+minutes.
+
+**Inputs**:
+- `iso_datetime`: The string to be parsed
+
+**Returns**:
+- `res`: The parsed datetime, in UTC.
+- `utc_offset`: The UTC offset, in minutes.
+- `is_leap`: Specifies whether the moment was a leap second.
+- `consumed`: The number of bytes consumed by parsing the string.
+
+**Notes**:
+- This procedure performs no validation on whether components are valid,
+  e.g. it'll return hour = 25 if that's what it's given in the specified
+  string.
+*/
 iso8601_to_components :: proc(iso_datetime: string) -> (res: dt.DateTime, utc_offset: int, is_leap: bool, consumed: int) {
 	moment, offset, count, leap_second, ok := _iso8601_to_components(iso_datetime)
 	if !ok {
diff --git a/core/time/perf.odin b/core/time/perf.odin
index 123d67eca..784d7acd6 100644
--- a/core/time/perf.odin
+++ b/core/time/perf.odin
@@ -3,18 +3,39 @@ package time
 import "base:runtime"
 import "base:intrinsics"
 
+/*
+Type representing monotonic time, useful for measuring durations.
+*/
 Tick :: struct {
 	_nsec: i64, // relative amount
 }
+
+/*
+Obtain the current tick.
+*/
 tick_now :: proc "contextless" () -> Tick {
 	return _tick_now()
 }
 
+/*
+Obtain the difference between ticks.
+*/
 tick_diff :: proc "contextless" (start, end: Tick) -> Duration {
 	d := end._nsec - start._nsec
 	return Duration(d)
 }
 
+/*
+Incrementally obtain durations since last tick.
+
+This procedure returns the duration between the current tick and the tick
+stored in `prev` pointer, and then stores the current tick in location,
+specified by `prev`. If the prev pointer contains an zero-initialized tick,
+then the returned duration is 0.
+
+This procedure is meant to be used in a loop, or in other scenarios, where one
+might want to obtain time between multiple ticks at specific points.
+*/
 tick_lap_time :: proc "contextless" (prev: ^Tick) -> Duration {
 	d: Duration
 	t := tick_now()
@@ -25,17 +46,21 @@ tick_lap_time :: proc "contextless" (prev: ^Tick) -> Duration {
 	return d
 }
 
+/*
+Obtain the duration since last tick.
+*/
 tick_since :: proc "contextless" (start: Tick) -> Duration {
 	return tick_diff(start, tick_now())
 }
 
-
+/*
+Capture the duration the code in the current scope takes to execute.
+*/
 @(deferred_in_out=_tick_duration_end)
 SCOPED_TICK_DURATION :: proc "contextless" (d: ^Duration) -> Tick {
 	return tick_now()
 }
 
-
 _tick_duration_end :: proc "contextless" (d: ^Duration, t: Tick) {
 	d^ = tick_since(t)
 }
@@ -62,6 +87,13 @@ when ODIN_OS != .Darwin && ODIN_OS != .Linux && ODIN_OS != .FreeBSD {
 	}
 }
 
+/*
+Check if the CPU has invariant TSC.
+
+This procedure checks if the CPU contains an invariant TSC (Time stamp counter).
+Invariant TSC is a feature of modern processors that allows them to run their
+TSC at a fixed frequency, independent of ACPI state, and CPU frequency.
+*/
 has_invariant_tsc :: proc "contextless" () -> bool {
 	when ODIN_ARCH == .amd64 {
 		return x86_has_invariant_tsc()
@@ -70,6 +102,17 @@ has_invariant_tsc :: proc "contextless" () -> bool {
 	return false
 }
 
+/*
+Obtain the CPU's TSC frequency, in hertz.
+
+This procedure tries to obtain the CPU's TSC frequency in hertz. If the CPU
+doesn't have an invariant TSC, this procedure returns with an error. Otherwise
+an attempt is made to fetch the TSC frequency from the OS. If this fails,
+the frequency is obtained by sleeping for the specified amount of time and
+dividing the readings from TSC by the duration of the sleep.
+
+The duration of sleep can be controlled by `fallback_sleep` parameter.
+*/
 tsc_frequency :: proc "contextless" (fallback_sleep := 2 * Second) -> (u64, bool) {
 	if !has_invariant_tsc() {
 		return 0, false
@@ -93,37 +136,64 @@ tsc_frequency :: proc "contextless" (fallback_sleep := 2 * Second) -> (u64, bool
 	return hz, true
 }
 
-/*
-	Benchmark helpers
-*/
+// Benchmark helpers
 
+/*
+Errors returned by the `benchmark()` procedure.
+*/
 Benchmark_Error :: enum {
 	Okay = 0,
 	Allocation_Error,
 }
 
+/*
+Options for benchmarking.
+*/
 Benchmark_Options :: struct {
+	// The initialization procedure. `benchmark()` will call this before taking measurements.
 	setup:     #type proc(options: ^Benchmark_Options, allocator: runtime.Allocator) -> (err: Benchmark_Error),
+	// The procedure to benchmark.
 	bench:     #type proc(options: ^Benchmark_Options, allocator: runtime.Allocator) -> (err: Benchmark_Error),
+	// The deinitialization procedure.
 	teardown:  #type proc(options: ^Benchmark_Options, allocator: runtime.Allocator) -> (err: Benchmark_Error),
-
+	// Field to be used by `bench()` procedure for any purpose.
 	rounds:    int,
+	// Field to be used by `bench()` procedure for any purpose.
 	bytes:     int,
+	// Field to be used by `bench()` procedure for any purpose.
 	input:     []u8,
-
+	// `bench()` writes to specify the count of elements processed.
 	count:     int,
+	// `bench()` writes to specify the number of bytes processed.
 	processed: int,
+	// `bench()` can write the output slice here.
 	output:    []u8, // Unused for hash benchmarks
+	// `bench()` can write the output hash here.
 	hash:      u128,
-
-	/*
-		Performance
-	*/
+	// `benchmark()` procedure will output the duration of benchmark
 	duration:             Duration,
+	// `benchmark()` procedure will output the average count of elements
+	// processed per second, using the `count` field of this struct.
 	rounds_per_second:    f64,
+	// `benchmark()` procedure will output the average number of megabytes
+	// processed per second, using the `processed` field of this struct.
 	megabytes_per_second: f64,
 }
 
+/*
+Benchmark a procedure.
+
+This procedure produces a benchmark. The procedure specified in the `bench`
+field of the `options` parameter will be benchmarked. The following metrics
+can be obtained:
+
+- Run time of the procedure
+- Number of elements per second processed on average
+- Number of bytes per second this processed on average
+
+In order to obtain these metrics, the `bench()` procedure writes to `options`
+struct the number of elements or bytes it has processed.
+*/
 benchmark :: proc(options: ^Benchmark_Options, allocator := context.allocator) -> (err: Benchmark_Error) {
 	assert(options != nil)
 	assert(options.bench != nil)
diff --git a/core/time/rfc3339.odin b/core/time/rfc3339.odin
index 0a2d431b7..e4c6565d6 100644
--- a/core/time/rfc3339.odin
+++ b/core/time/rfc3339.odin
@@ -4,10 +4,33 @@ package time
 
 import dt "core:time/datetime"
 
-// Parses an RFC 3339 string and returns Time in UTC, with any UTC offset applied to it.
-// Only 4-digit years are accepted.
-// Optional pointer to boolean `is_leap` will return `true` if the moment was a leap second.
-// Leap seconds are smeared into 23:59:59.
+/*
+Parse an RFC 3339 string into time with a UTC offset applied to it.
+
+This procedure parses the specified RFC 3339 strings of roughly the following
+format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns the time that was represented by the RFC 3339 string, with the UTC
+offset applied to it.
+
+**Inputs**:
+- `rfc_datetime`: An RFC 3339 string to parse.
+- `is_leap`: Optional output parameter specifying whether the moment was a leap
+  second.
+
+**Returns**:
+- `res`: The time, with UTC offset applied, that was parsed from the RFC 3339
+  string.
+- `consumed`: The number of bytes consumed by parsing the RFC 3339 string.
+
+**Notes**:
+- Only 4-digit years are accepted.
+- Leap seconds are smeared into 23:59:59.
+*/
 rfc3339_to_time_utc :: proc(rfc_datetime: string, is_leap: ^bool = nil) -> (res: Time, consumed: int) {
 	offset: int
 
@@ -16,11 +39,34 @@ rfc3339_to_time_utc :: proc(rfc_datetime: string, is_leap: ^bool = nil) -> (res:
 	return res, consumed
 }
 
-// Parses an RFC 3339 string and returns Time and a UTC offset in minutes.
-// e.g. 1985-04-12T23:20:50.52Z
-// Note: Only 4-digit years are accepted.
-// Optional pointer to boolean `is_leap` will return `true` if the moment was a leap second.
-// Leap seconds are smeared into 23:59:59.
+/*
+Parse an RFC 3339 string into a time and a UTC offset in minutes.
+
+This procedure parses the specified RFC 3339 strings of roughly the following
+format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns the time, in UTC and a UTC offset, in minutes, that were represented
+by the RFC 3339 string.
+
+**Inputs**:
+- `rfc_datetime`: The RFC 3339 string to be parsed.
+- `is_leap`: Optional output parameter specifying whether the moment was a
+  leap second.
+
+**Returns**:
+- `res`: The time, in UTC, that was parsed from the RFC 3339 string.
+- `utc_offset`: The UTC offset, in minutes, that was parsed from the RFC 3339
+  string.
+- `consumed`: The number of bytes consumed by parsing the string.
+
+**Notes**:
+- Only 4-digit years are accepted.
+- Leap seconds are smeared into 23:59:59.
+*/
 rfc3339_to_time_and_offset :: proc(rfc_datetime: string, is_leap: ^bool = nil) -> (res: Time, utc_offset: int, consumed: int) {
 	moment, offset, leap_second, count := rfc3339_to_components(rfc_datetime)
 	if count == 0 {
@@ -38,9 +84,31 @@ rfc3339_to_time_and_offset :: proc(rfc_datetime: string, is_leap: ^bool = nil) -
 	}
 }
 
-// Parses an RFC 3339 string and returns Time and a UTC offset in minutes.
-// e.g. 1985-04-12T23:20:50.52Z
-// Performs no validation on whether components are valid, e.g. it'll return hour = 25 if that's what it's given
+/*
+Parse an RFC 3339 string into a datetime and a UTC offset in minutes.
+
+This procedure parses the specified RFC 3339 strings of roughly the following
+format:
+
+```text
+YYYY-MM-DD[Tt]HH:mm:ss[.nn][Zz][+-]HH:mm
+```
+
+And returns the datetime, in UTC and the UTC offset, in minutes, that were
+represented by the RFC 3339 string.
+
+**Inputs**:
+- `rfc_datetime`: The RFC 3339 string to parse.
+
+**Returns**:
+- `res`: The datetime, in UTC, that was parsed from the RFC 3339 string.
+- `utc_offset`: The UTC offset, in minutes, that was parsed from the RFC 3339
+  string.
+- `is_leap`: Specifies whether the moment was a leap second.
+- `consumed`: Number of bytes consumed by parsing the string.
+
+Performs no validation on whether components are valid, e.g. it'll return hour = 25 if that's what it's given
+*/
 rfc3339_to_components :: proc(rfc_datetime: string) -> (res: dt.DateTime, utc_offset: int, is_leap: bool, consumed: int) {
 	moment, offset, count, leap_second, ok := _rfc3339_to_components(rfc_datetime)
 	if !ok {
diff --git a/core/time/time.odin b/core/time/time.odin
index 4ea5afc70..fad6512f3 100644
--- a/core/time/time.odin
+++ b/core/time/time.odin
@@ -3,24 +3,73 @@ package time
 import    "base:intrinsics"
 import dt "core:time/datetime"
 
+/*
+Type representing duration, with nanosecond precision.
+This is the regular Unix timestamp, scaled to nanosecond precision.
+*/
 Duration :: distinct i64
 
+/*
+The duration equal to one nanosecond (1e-9 seconds).
+*/
 Nanosecond  :: Duration(1)
+
+/*
+The duration equal to one microsecond (1e-6 seconds).
+*/
 Microsecond :: 1000 * Nanosecond
+
+/*
+The duration equal to one millisecond (1e-3 seconds).
+*/
 Millisecond :: 1000 * Microsecond
+
+/*
+The duration equal to one second.
+*/
 Second      :: 1000 * Millisecond
+
+/*
+The duration equal to one minute (60 seconds).
+*/
 Minute      :: 60 * Second
+
+/*
+The duration equal to one hour (3600 seconds).
+*/
 Hour        :: 60 * Minute
 
+/*
+Minimum representable duration.
+*/
 MIN_DURATION :: Duration(-1 << 63)
+
+/*
+Maximum representable duration.
+*/
 MAX_DURATION :: Duration(1<<63 - 1)
 
+/*
+Value specifying whether the time procedures are supported by the current
+platform.
+*/
 IS_SUPPORTED :: _IS_SUPPORTED
 
+/*
+Specifies time since the UNIX epoch, with nanosecond precision.
+
+Capable of representing any time within the following range:
+
+- `min: 1677-09-21 00:12:44.145224192 +0000 UTC`
+- `max: 2262-04-11 23:47:16.854775807 +0000 UTC`
+*/
 Time :: struct {
 	_nsec: i64, // Measured in UNIX nanonseconds
 }
 
+/*
+Type representing a month.
+*/
 Month :: enum int {
 	January = 1,
 	February,
@@ -36,6 +85,9 @@ Month :: enum int {
 	December,
 }
 
+/*
+Type representing a weekday.
+*/
 Weekday :: enum int {
 	Sunday = 0,
 	Monday,
@@ -46,20 +98,37 @@ Weekday :: enum int {
 	Saturday,
 }
 
+/*
+Type representing a stopwatch.
+
+The stopwatch is used for measuring the total time in multiple "runs". When the
+stopwatch is started, it starts counting time. When the stopwatch is stopped,
+the difference in time between the last start and the stop is added to the
+total. When the stopwatch resets, the total is reset.
+*/
 Stopwatch :: struct {
 	running: bool,
 	_start_time: Tick,
 	_accumulation: Duration,
 }
 
+/*
+Obtain the current time.
+*/
 now :: proc "contextless" () -> Time {
 	return _now()
 }
 
+/*
+Sleep for the specified duration.
+*/
 sleep :: proc "contextless" (d: Duration) {
 	_sleep(d)
 }
 
+/*
+Start the stopwatch.
+*/
 stopwatch_start :: proc "contextless" (stopwatch: ^Stopwatch) {
 	if !stopwatch.running {
 		stopwatch._start_time = tick_now()
@@ -67,6 +136,9 @@ stopwatch_start :: proc "contextless" (stopwatch: ^Stopwatch) {
 	}
 }
 
+/*
+Stop the stopwatch.
+*/
 stopwatch_stop :: proc "contextless" (stopwatch: ^Stopwatch) {
 	if stopwatch.running {
 		stopwatch._accumulation += tick_diff(stopwatch._start_time, tick_now())
@@ -74,11 +146,21 @@ stopwatch_stop :: proc "contextless" (stopwatch: ^Stopwatch) {
 	}
 }
 
+/*
+Reset the stopwatch.
+*/
 stopwatch_reset :: proc "contextless" (stopwatch: ^Stopwatch) {
 	stopwatch._accumulation = {}
 	stopwatch.running = false
 }
 
+/*
+Obtain the total time, counted by the stopwatch.
+
+This procedure obtains the total time, counted by the stopwatch. If the stopwatch
+isn't stopped at the time of calling this procedure, the time between the last
+start and the current time is also accounted for.
+*/
 stopwatch_duration :: proc "contextless" (stopwatch: Stopwatch) -> Duration {
 	if !stopwatch.running {
 		return stopwatch._accumulation
@@ -86,40 +168,92 @@ stopwatch_duration :: proc "contextless" (stopwatch: Stopwatch) -> Duration {
 	return stopwatch._accumulation + tick_diff(stopwatch._start_time, tick_now())
 }
 
+/*
+Calculate the duration elapsed between two times.
+*/
 diff :: proc "contextless" (start, end: Time) -> Duration {
 	d := end._nsec - start._nsec
 	return Duration(d)
 }
 
+/*
+Calculate the duration elapsed since a specific time.
+*/
 since :: proc "contextless" (start: Time) -> Duration {
 	return diff(start, now())
 }
 
+/*
+Obtain the number of nanoseconds in a duration.
+*/
 duration_nanoseconds :: proc "contextless" (d: Duration) -> i64 {
 	return i64(d)
 }
+
+/*
+Obtain the number of microseconds in a duration.
+*/
 duration_microseconds :: proc "contextless" (d: Duration) -> f64 {
 	return duration_seconds(d) * 1e6
 }
+
+/*
+Obtain the number of milliseconds in a duration.
+*/
 duration_milliseconds :: proc "contextless" (d: Duration) -> f64 {
 	return duration_seconds(d) * 1e3
 }
+
+/*
+Obtain the number of seconds in a duration.
+*/
 duration_seconds :: proc "contextless" (d: Duration) -> f64 {
 	sec := d / Second
 	nsec := d % Second
 	return f64(sec) + f64(nsec)/1e9
 }
+
+/*
+Obtain the number of minutes in a duration.
+*/
 duration_minutes :: proc "contextless" (d: Duration) -> f64 {
 	min := d / Minute
 	nsec := d % Minute
 	return f64(min) + f64(nsec)/(60*1e9)
 }
+
+/*
+Obtain the number of hours in a duration.
+*/
 duration_hours :: proc "contextless" (d: Duration) -> f64 {
 	hour := d / Hour
 	nsec := d % Hour
 	return f64(hour) + f64(nsec)/(60*60*1e9)
 }
 
+/*
+Round a duration to a specific unit.
+
+This procedure rounds the duration to a specific unit.
+
+**Inputs**:
+- `d`: The duration to round.
+- `m`: The unit to round to.
+
+**Returns**:
+- The duration `d`, rounded to the unit specified by `m`.
+
+**Example**:
+
+In order to obtain the rough amount of seconds in a duration, the following call
+can be used:
+
+```
+time.duration_round(my_duration, time.Second)
+```
+
+**Note**: Any duration can be supplied as a unit.
+*/
 duration_round :: proc "contextless" (d, m: Duration) -> Duration {
 	_less_than_half :: #force_inline proc "contextless" (x, y: Duration) -> bool {
 		return u64(x)+u64(x) < u64(y)
@@ -149,50 +283,103 @@ duration_round :: proc "contextless" (d, m: Duration) -> Duration {
 	return MAX_DURATION
 }
 
+/*
+Truncate the duration to the specified unit.
+
+This procedure truncates the duration `d` to the unit specified by `m`.
+
+**Inputs**:
+- `d`: The duration to truncate.
+- `m`: The unit to truncate to.
+
+**Returns**:
+- The duration `d`, truncated to the unit specified by `m`.
+
+**Example**:
+
+In order to obtain the amount of whole seconds in a duration, the following call
+can be used:
+
+```
+time.duration_round(my_duration, time.Second)
+```
+
+**Note**: Any duration can be supplied as a unit.
+*/
 duration_truncate :: proc "contextless" (d, m: Duration) -> Duration {
 	return d if m <= 0 else d - d%m
 }
 
+/*
+Parse time into date components.
+*/
 date :: proc "contextless" (t: Time) -> (year: int, month: Month, day: int) {
 	year, month, day, _ = _abs_date(_time_abs(t), true)
 	return
 }
 
+/*
+Obtain the year of the date specified by time.
+*/
 year :: proc "contextless" (t: Time) -> (year: int) {
 	year, _, _, _ = _date(t, true)
 	return
 }
 
+/*
+Obtain the month of the date specified by time.
+*/
 month :: proc "contextless" (t: Time) -> (month: Month) {
 	_, month, _, _ = _date(t, true)
 	return
 }
 
+/*
+Obtain the day of the date specified by time.
+*/
 day :: proc "contextless" (t: Time) -> (day: int) {
 	_, _, day, _ = _date(t, true)
 	return
 }
 
+/*
+Obtain the week day of the date specified by time.
+*/
 weekday :: proc "contextless" (t: Time) -> (weekday: Weekday) {
 	abs := _time_abs(t)
 	sec := (abs + u64(Weekday.Monday) * SECONDS_PER_DAY) % SECONDS_PER_WEEK
 	return Weekday(int(sec) / SECONDS_PER_DAY)
 }
 
+/*
+Obtain the time components from a time, a duration or a stopwatch's total.
+*/
 clock :: proc { clock_from_time, clock_from_duration, clock_from_stopwatch }
 
+/*
+Obtain the time components from a time.
+*/
 clock_from_time :: proc "contextless" (t: Time) -> (hour, min, sec: int) {
 	return clock_from_seconds(_time_abs(t))
 }
 
+/*
+Obtain the time components from a duration.
+*/
 clock_from_duration :: proc "contextless" (d: Duration) -> (hour, min, sec: int) {
 	return clock_from_seconds(u64(d/1e9))
 }
 
+/*
+Obtain the time components from a stopwatch's total.
+*/
 clock_from_stopwatch :: proc "contextless" (s: Stopwatch) -> (hour, min, sec: int) {
 	return clock_from_duration(stopwatch_duration(s))
 }
 
+/*
+Obtain the time components from the number of seconds.
+*/
 clock_from_seconds :: proc "contextless" (nsec: u64) -> (hour, min, sec: int) {
 	sec = int(nsec % SECONDS_PER_DAY)
 	hour = sec / SECONDS_PER_HOUR
@@ -202,10 +389,16 @@ clock_from_seconds :: proc "contextless" (nsec: u64) -> (hour, min, sec: int) {
 	return
 }
 
+/*
+Read the timestamp counter of the CPU.
+*/
 read_cycle_counter :: proc "contextless" () -> u64 {
 	return u64(intrinsics.read_cycle_counter())
 }
 
+/*
+Obtain time from unix seconds and unix nanoseconds.
+*/
 unix :: proc "contextless" (sec: i64, nsec: i64) -> Time {
 	sec, nsec := sec, nsec
 	if nsec < 0 || nsec >= 1e9 {
@@ -220,31 +413,59 @@ unix :: proc "contextless" (sec: i64, nsec: i64) -> Time {
 	return Time{(sec*1e9 + nsec)}
 }
 
+/*
+Obtain time from unix nanoseconds.
+*/
 from_nanoseconds :: #force_inline proc "contextless" (nsec: i64) -> Time {
 	return Time{nsec}
 }
 
+/*
+Alias for `time_to_unix`.
+*/
 to_unix_seconds :: time_to_unix
+
+/*
+Obtain the Unix timestamp in seconds from a Time.
+*/
 time_to_unix :: proc "contextless" (t: Time) -> i64 {
 	return t._nsec/1e9
 }
 
+/*
+Alias for `time_to_unix_nano`.
+*/
 to_unix_nanoseconds :: time_to_unix_nano
+
+/*
+Obtain the Unix timestamp in nanoseconds from a Time.
+*/
 time_to_unix_nano :: proc "contextless" (t: Time) -> i64 {
 	return t._nsec
 }
 
+/*
+Add duration to a time.
+*/
 time_add :: proc "contextless" (t: Time, d: Duration) -> Time {
 	return Time{t._nsec + i64(d)}
 }
 
-// Accurate sleep borrowed from: https://blat-blatnik.github.io/computerBear/making-accurate-sleep-function/
-//
-// Accuracy seems to be pretty good out of the box on Linux, to within around 4µs worst case.
-// On Windows it depends but is comparable with regular sleep in the worst case.
-// To get the same kind of accuracy as on Linux, have your program call `windows.timeBeginPeriod(1)` to
-// tell Windows to use a more accurate timer for your process.
-// Additionally your program should call `windows.timeEndPeriod(1)` once you're done with `accurate_sleep`. 
+/*
+Accurate sleep
+
+This procedure sleeps for the duration specified by `d`, very accurately.
+
+**Note**: Implementation borrowed from: [this source](https://blat-blatnik.github.io/computerBear/making-accurate-sleep-function/)
+
+**Note(linux)**: The accuracy is within around 4µs (microseconds), in the worst case.
+
+**Note(windows)**: The accuracy depends but is comparable with regular sleep in
+the worst case. To get the same kind of accuracy as on Linux, have your program
+call `windows.timeBeginPeriod(1)` to tell Windows to use a more accurate timer
+for your process. Additionally your program should call `windows.timeEndPeriod(1)`
+once you're done with `accurate_sleep`. 
+*/
 accurate_sleep :: proc "contextless" (d: Duration) {
 	to_sleep, estimate, mean, m2, count: Duration
 
@@ -362,6 +583,13 @@ _abs_date :: proc "contextless" (abs: u64, full: bool) -> (year: int, month: Mon
 	return
 }
 
+/*
+Convert datetime components into time.
+
+This procedure calculates the time from datetime components supplied in the
+arguments to this procedure. If the datetime components don't represent a valid
+datetime, the function returns `false` in the second argument.
+*/
 components_to_time :: proc "contextless" (#any_int year, #any_int month, #any_int day, #any_int hour, #any_int minute, #any_int second: i64, #any_int nsec := i64(0)) -> (t: Time, ok: bool) {
 	this_date, err := dt.components_to_datetime(year, month, day, hour, minute, second, nsec)
 	if err != .None {
@@ -370,6 +598,12 @@ components_to_time :: proc "contextless" (#any_int year, #any_int month, #any_in
 	return compound_to_time(this_date)
 }
 
+/*
+Convert datetime into time.
+
+If the datetime represents a time outside of a valid range, `false` is returned
+as the second return value. See `Time` for the representable range.
+*/
 compound_to_time :: proc "contextless" (datetime: dt.DateTime) -> (t: Time, ok: bool) {
 	unix_epoch := dt.DateTime{{1970, 1, 1}, {0, 0, 0, 0}}
 	delta, err := dt.sub(datetime, unix_epoch)
@@ -387,12 +621,21 @@ compound_to_time :: proc "contextless" (datetime: dt.DateTime) -> (t: Time, ok:
 	return Time{_nsec=i64(nanoseconds)}, true
 }
 
+/*
+Convert datetime components into time.
+*/
 datetime_to_time :: proc{components_to_time, compound_to_time}
 
+/*
+Check if a year is a leap year.
+*/
 is_leap_year :: proc "contextless" (year: int) -> (leap: bool) {
 	return year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
 }
 
+/*
+Days before each month in a year, not counting the leap day on february 29th.
+*/
 @(rodata)
 days_before := [?]i32{
 	0,
@@ -410,11 +653,37 @@ days_before := [?]i32{
 	31 + 28 + 31 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30 + 31,
 }
 
-
+/*
+Number of seconds in a minute (without leap seconds).
+*/
 SECONDS_PER_MINUTE :: 60
+
+/*
+Number of seconds in an hour (without leap seconds).
+*/
 SECONDS_PER_HOUR   :: 60 * SECONDS_PER_MINUTE
+
+/*
+Number of seconds in a day (without leap seconds).
+*/
 SECONDS_PER_DAY    :: 24 * SECONDS_PER_HOUR
+
+/*
+Number of seconds in a week (without leap seconds).
+*/
 SECONDS_PER_WEEK   ::  7 * SECONDS_PER_DAY
+
+/*
+Days in 400 years, with leap days.
+*/
 DAYS_PER_400_YEARS :: 365*400 + 97
+
+/*
+Days in 100 years, with leap days.
+*/
 DAYS_PER_100_YEARS :: 365*100 + 24
+
+/*
+Days in 4 years, with leap days.
+*/
 DAYS_PER_4_YEARS   :: 365*4   + 1
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 000000000..27072a480
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,9 @@
+# Examples
+
+The `example` directory contains two packages:
+
+A [demo](examples/demo) illustrating the basics of Odin.
+
+It further contains [all](examples/all), which imports all [core](core) and [vendor](vendor) packages so we can conveniently run `odin check` on everything at once.
+
+For additional example code, see the [examples](https://github.com/odin-lang/examples) repository.
diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin
index 976a0f0e5..d92a6b8c4 100644
--- a/examples/all/all_main.odin
+++ b/examples/all/all_main.odin
@@ -61,6 +61,7 @@ import cbor             "core:encoding/cbor"
 import csv              "core:encoding/csv"
 import endian           "core:encoding/endian"
 import hxa              "core:encoding/hxa"
+import ini              "core:encoding/ini"
 import json             "core:encoding/json"
 import varint           "core:encoding/varint"
 import xml              "core:encoding/xml"
@@ -193,6 +194,7 @@ _ :: base32
 _ :: base64
 _ :: csv
 _ :: hxa
+_ :: ini
 _ :: json
 _ :: varint
 _ :: xml
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index 4d3e20a7a..49bb83b22 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -440,6 +440,8 @@ struct BuildContext {
 	bool   cached;
 	BuildCacheData build_cache_data;
 
+	bool internal_no_inline;
+
 	bool   no_threaded_checker;
 
 	bool   show_debug_messages;
@@ -1649,11 +1651,24 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 	if (!bc->custom_optimization_level) {
 		// NOTE(bill): when building with `-debug` but not specifying an optimization level
 		// default to `-o:none` to improve the debug symbol generation by default
-		bc->optimization_level = -1; // -o:none
+		if (bc->ODIN_DEBUG) {
+			bc->optimization_level = -1; // -o:none
+		} else {
+			bc->optimization_level = 0; // -o:minimal
+		}
 	}
 
 	bc->optimization_level = gb_clamp(bc->optimization_level, -1, 3);
 
+#if defined(GB_SYSTEM_WINDOWS)
+	if (bc->optimization_level <= 0) {
+		if (!is_arch_wasm()) {
+			bc->use_separate_modules = true;
+		}
+	}
+#endif
+
+
 	// TODO: Static map calls are bugged on `amd64sysv` abi.
 	if (bc->metrics.os != TargetOs_windows && bc->metrics.arch == TargetArch_amd64) {
 		// ENFORCE DYNAMIC MAP CALLS
diff --git a/src/cached.cpp b/src/cached.cpp
index 7f213ba21..4ad65ee9e 100644
--- a/src/cached.cpp
+++ b/src/cached.cpp
@@ -17,11 +17,11 @@ gb_internal bool recursively_delete_directory(wchar_t *wpath_c) {
 
 	wchar_t dir_path[MAX_PATH] = {};
 	wchar_t filename[MAX_PATH] = {};
-	wcscpy(dir_path, wpath_c);
-	wcscat(dir_path, L"\\*");
+	wcscpy_s(dir_path, wpath_c);
+	wcscat_s(dir_path, L"\\*");
 
-	wcscpy(filename, wpath_c);
-	wcscat(filename, L"\\");
+	wcscpy_s(filename, wpath_c);
+	wcscat_s(filename, L"\\");
 
 
 	WIN32_FIND_DATAW find_file_data = {};
@@ -31,21 +31,21 @@ gb_internal bool recursively_delete_directory(wchar_t *wpath_c) {
 	}
 	defer (FindClose(hfind));
 
-	wcscpy(dir_path, filename);
+	wcscpy_s(dir_path, filename);
 
 	for (;;) {
 		if (FindNextFileW(hfind, &find_file_data)) {
 			if (is_dots_w(find_file_data.cFileName)) {
 				continue;
 			}
-			wcscat(filename, find_file_data.cFileName);
+			wcscat_s(filename, find_file_data.cFileName);
 
 			if (find_file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
 				if (!recursively_delete_directory(filename)) {
 					return false;
 				}
 				RemoveDirectoryW(filename);
-				wcscpy(filename, dir_path);
+				wcscpy_s(filename, dir_path);
 			} else {
 				if (find_file_data.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
 					_wchmod(filename, _S_IWRITE);
@@ -53,7 +53,7 @@ gb_internal bool recursively_delete_directory(wchar_t *wpath_c) {
 				if (!DeleteFileW(filename)) {
 					return false;
 				}
-				wcscpy(filename, dir_path);
+				wcscpy_s(filename, dir_path);
 			}
 		} else {
 			if (GetLastError() == ERROR_NO_MORE_FILES) {
diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp
index eec01b497..b6b1f9874 100644
--- a/src/check_builtin.cpp
+++ b/src/check_builtin.cpp
@@ -1079,7 +1079,7 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 	return false;
 }
 
-gb_internal bool cache_load_file_directive(CheckerContext *c, Ast *call, String const &original_string, bool err_on_not_found, LoadFileCache **cache_, LoadFileTier tier) {
+gb_internal bool cache_load_file_directive(CheckerContext *c, Ast *call, String const &original_string, bool err_on_not_found, LoadFileCache **cache_, LoadFileTier tier, bool use_mutex=true) {
 	ast_node(ce, CallExpr, call);
 	ast_node(bd, BasicDirective, ce->proc);
 	String builtin_name = bd->name.string;
@@ -1101,7 +1101,8 @@ gb_internal bool cache_load_file_directive(CheckerContext *c, Ast *call, String
 		}
 	}
 
-	MUTEX_GUARD(&c->info->load_file_mutex);
+	if (use_mutex) mutex_lock(&c->info->load_file_mutex);
+	defer (if (use_mutex) mutex_unlock(&c->info->load_file_mutex));
 
 	gbFileError file_error = gbFileError_None;
 	String data = {};
@@ -1414,9 +1415,12 @@ gb_internal LoadDirectiveResult check_load_directory_directive(CheckerContext *c
 
 		file_caches = array_make<LoadFileCache *>(heap_allocator(), 0, files_to_reserve);
 
+		mutex_lock(&c->info->load_file_mutex);
+		defer (mutex_unlock(&c->info->load_file_mutex));
+
 		for (FileInfo fi : list) {
 			LoadFileCache *cache = nullptr;
-			if (cache_load_file_directive(c, call, fi.fullpath, err_on_not_found, &cache, LoadFileTier_Contents)) {
+			if (cache_load_file_directive(c, call, fi.fullpath, err_on_not_found, &cache, LoadFileTier_Contents, /*use_mutex*/false)) {
 				array_add(&file_caches, cache);
 			} else {
 				result = LoadDirective_Error;
@@ -4298,6 +4302,49 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As
 		}
 		break;
 
+	case BuiltinProc_add_sat:
+	case BuiltinProc_sub_sat:
+		{
+			Operand x = {};
+			Operand y = {};
+			check_expr(c, &x, ce->args[0]);
+			check_expr(c, &y, ce->args[1]);
+			if (x.mode == Addressing_Invalid) {
+				return false;
+			}
+			if (y.mode == Addressing_Invalid) {
+				return false;
+			}
+			convert_to_typed(c, &y, x.type); if (y.mode == Addressing_Invalid) return false;
+			convert_to_typed(c, &x, y.type);
+			if (is_type_untyped(x.type)) {
+				gbString xts = type_to_string(x.type);
+				error(x.expr, "Expected a typed integer for '%.*s', got %s", LIT(builtin_name), xts);
+				gb_string_free(xts);
+				return false;
+			}
+			if (!is_type_integer(x.type)) {
+				gbString xts = type_to_string(x.type);
+				error(x.expr, "Expected an integer for '%.*s', got %s", LIT(builtin_name), xts);
+				gb_string_free(xts);
+				return false;
+			}
+			Type *ct = core_type(x.type);
+			if (is_type_different_to_arch_endianness(ct)) {
+				GB_ASSERT(ct->kind == Type_Basic);
+				if (ct->Basic.flags & (BasicFlag_EndianLittle|BasicFlag_EndianBig)) {
+					gbString xts = type_to_string(x.type);
+					error(x.expr, "Expected an integer which does not specify the explicit endianness for '%.*s', got %s", LIT(builtin_name), xts);
+					gb_string_free(xts);
+					return false;
+				}
+			}
+
+			operand->mode = Addressing_Value;
+			operand->type = default_type(x.type);
+		}
+		break;
+
 	case BuiltinProc_sqrt:
 		{
 			Operand x = {};
diff --git a/src/check_decl.cpp b/src/check_decl.cpp
index 7d81d102d..a1436fe03 100644
--- a/src/check_decl.cpp
+++ b/src/check_decl.cpp
@@ -182,8 +182,7 @@ gb_internal void override_entity_in_scope(Entity *original_entity, Entity *new_e
 	original_entity->type = new_entity->type;
 	original_entity->aliased_of = new_entity;
 
-	Ast *empty_ident = nullptr;
-	original_entity->identifier.compare_exchange_strong(empty_ident, new_entity->identifier);
+	original_entity->identifier.store(new_entity->identifier);
 
 	if (original_entity->identifier.load() != nullptr &&
 	    original_entity->identifier.load()->kind == Ast_Ident) {
@@ -1869,5 +1868,14 @@ gb_internal bool check_proc_body(CheckerContext *ctx_, Token token, DeclInfo *de
 
 	add_deps_from_child_to_parent(decl);
 
+	for (VariadicReuseData const &vr : decl->variadic_reuses) {
+		GB_ASSERT(vr.slice_type->kind == Type_Slice);
+		Type *elem = vr.slice_type->Slice.elem;
+		i64 size = type_size_of(elem);
+		i64 align = type_align_of(elem);
+		decl->variadic_reuse_max_bytes = gb_max(decl->variadic_reuse_max_bytes, size*vr.max_count);
+		decl->variadic_reuse_max_align = gb_max(decl->variadic_reuse_max_align, align);
+	}
+
 	return true;
 }
diff --git a/src/check_expr.cpp b/src/check_expr.cpp
index 12acca0cb..01ff9da5b 100644
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -500,7 +500,9 @@ gb_internal bool find_or_generate_polymorphic_procedure(CheckerContext *old_c, E
 		nctx.no_polymorphic_errors = false;
 
 		// NOTE(bill): Reset scope from the failed procedure type
-		scope_reset(scope);
+		scope->head_child.store(nullptr, std::memory_order_relaxed);
+		string_map_clear(&scope->elements);
+		ptr_set_clear(&scope->imported);
 
 		// LEAK NOTE(bill): Cloning this AST may be leaky but this is not really an issue due to arena-based allocation
 		Ast *cloned_proc_type_node = clone_ast(pt->node);
@@ -6033,6 +6035,22 @@ gb_internal CallArgumentError check_call_arguments_internal(CheckerContext *c, A
 
 					Entity *vt = pt->params->Tuple.variables[pt->variadic_index];
 					o.type = vt->type;
+
+					// NOTE(bill, 2024-07-14): minimize the stack usage for variadic parameters with the backing array
+					if (c->decl) {
+						bool found = false;
+						for (auto &vr : c->decl->variadic_reuses) {
+							if (are_types_identical(vt->type, vr.slice_type)) {
+								vr.max_count = gb_max(vr.max_count, variadic_operands.count);
+								found = true;
+								break;
+							}
+						}
+						if (!found) {
+							array_add(&c->decl->variadic_reuses, VariadicReuseData{vt->type, variadic_operands.count});
+						}
+					}
+
 				} else {
 					dummy_argument_count += 1;
 					o.type = t_untyped_nil;
@@ -7888,12 +7906,15 @@ gb_internal ExprKind check_call_expr(CheckerContext *c, Operand *operand, Ast *c
 
 			// NOTE: Due to restrictions in LLVM you can not inline calls with a superset of features.
 			if (is_call_inlined) {
-				GB_ASSERT(c->curr_proc_decl);
-				GB_ASSERT(c->curr_proc_decl->entity);
-				GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
-				String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
-				if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
-					error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+				if (c->curr_proc_decl == nullptr) {
+					error(call, "Calling a '#force_inline' procedure that enables target features is not allowed at file scope");
+				} else {
+					GB_ASSERT(c->curr_proc_decl->entity);
+					GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
+					String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
+					if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
+						error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+					}
 				}
 			}
 		}
@@ -9926,10 +9947,14 @@ gb_internal ExprKind check_compound_literal(CheckerContext *c, Operand *o, Ast *
 		}
 		Type *et = base_type(t->BitSet.elem);
 		isize field_count = 0;
-		if (et->kind == Type_Enum) {
+		if (et != nullptr && et->kind == Type_Enum) {
 			field_count = et->Enum.fields.count;
 		}
 
+		if (is_type_array(bit_set_to_int(t))) {
+			is_constant = false;
+		}
+
 		if (cl->elems[0]->kind == Ast_FieldValue) {
 			error(cl->elems[0], "'field = value' in a bit_set a literal is not allowed");
 			is_constant = false;
diff --git a/src/check_stmt.cpp b/src/check_stmt.cpp
index f4d3bd6b8..76b6d3f40 100644
--- a/src/check_stmt.cpp
+++ b/src/check_stmt.cpp
@@ -1060,6 +1060,9 @@ gb_internal void check_switch_stmt(CheckerContext *ctx, Ast *node, u32 mod_flags
 	if (ss->tag != nullptr) {
 		check_expr(ctx, &x, ss->tag);
 		check_assignment(ctx, &x, nullptr, str_lit("switch expression"));
+		if (x.type == nullptr) {
+			return;
+		}
 	} else {
 		x.mode  = Addressing_Constant;
 		x.type  = t_bool;
@@ -1834,7 +1837,7 @@ gb_internal void check_range_stmt(CheckerContext *ctx, Ast *node, u32 mod_flags)
 
 			if (rs->vals.count == 1) {
 				Type *t = type_deref(operand.type);
-				if (is_type_map(t) || is_type_bit_set(t)) {
+				if (t != NULL && (is_type_map(t) || is_type_bit_set(t))) {
 					gbString v = expr_to_string(rs->vals[0]);
 					defer (gb_string_free(v));
 					error_line("\tSuggestion: place parentheses around the expression\n");
@@ -2514,7 +2517,7 @@ gb_internal void check_return_stmt(CheckerContext *ctx, Ast *node) {
 			Entity *e = entity_of_node(x);
 			if (is_entity_local_variable(e)) {
 				unsafe_return_error(o, "the address of a local variable");
-			} else if(x->kind == Ast_CompoundLit) {
+			} else if (x->kind == Ast_CompoundLit) {
 				unsafe_return_error(o, "the address of a compound literal");
 			} else if (x->kind == Ast_IndexExpr) {
 				Entity *f = entity_of_node(x->IndexExpr.expr);
@@ -2529,6 +2532,14 @@ gb_internal void check_return_stmt(CheckerContext *ctx, Ast *node) {
 					unsafe_return_error(o, "the address of an indexed variable", f->type);
 				}
 			}
+		} else if (expr->kind == Ast_SliceExpr) {
+			Ast *x = unparen_expr(expr->SliceExpr.expr);
+			Entity *e = entity_of_node(x);
+			if (is_entity_local_variable(e) && is_type_array(e->type)) {
+				unsafe_return_error(o, "a slice of a local variable");
+			} else if (x->kind == Ast_CompoundLit) {
+				unsafe_return_error(o, "a slice of a compound literal");
+			}
 		} else if (o.mode == Addressing_Constant && is_type_slice(o.type)) {
 			ERROR_BLOCK();
 			unsafe_return_error(o, "a compound literal of a slice");
diff --git a/src/check_type.cpp b/src/check_type.cpp
index dd8559114..428fe8451 100644
--- a/src/check_type.cpp
+++ b/src/check_type.cpp
@@ -939,22 +939,6 @@ gb_internal void check_enum_type(CheckerContext *ctx, Type *enum_type, Type *nam
 	enum_type->Enum.max_value_index = max_value_index;
 }
 
-gb_internal bool is_valid_bit_field_backing_type(Type *type) {
-	if (type == nullptr) {
-		return false;
-	}
-	type = base_type(type);
-	if (is_type_untyped(type)) {
-		return false;
-	}
-	if (is_type_integer(type)) {
-		return true;
-	}
-	if (type->kind == Type_Array) {
-		return is_type_integer(type->Array.elem);
-	}
-	return false;
-}
 
 gb_internal void check_bit_field_type(CheckerContext *ctx, Type *bit_field_type, Type *named_type, Ast *node) {
 	ast_node(bf, BitFieldType, node);
@@ -1268,11 +1252,14 @@ gb_internal void check_bit_set_type(CheckerContext *c, Type *type, Type *named_t
 		Type *t = default_type(lhs.type);
 		if (bs->underlying != nullptr) {
 			Type *u = check_type(c, bs->underlying);
+			// if (!is_valid_bit_field_backing_type(u)) {
 			if (!is_type_integer(u)) {
 				gbString ts = type_to_string(u);
 				error(bs->underlying, "Expected an underlying integer for the bit set, got %s", ts);
 				gb_string_free(ts);
-				return;
+				if (!is_valid_bit_field_backing_type(u)) {
+					return;
+				}
 			}
 			type->BitSet.underlying = u;
 		}
@@ -1572,11 +1559,30 @@ gb_internal Type *determine_type_from_polymorphic(CheckerContext *ctx, Type *pol
 		return poly_type;
 	}
 	if (show_error) {
+		ERROR_BLOCK();
 		gbString pts = type_to_string(poly_type);
 		gbString ots = type_to_string(operand.type, true);
 		defer (gb_string_free(pts));
 		defer (gb_string_free(ots));
 		error(operand.expr, "Cannot determine polymorphic type from parameter: '%s' to '%s'", ots, pts);
+
+		Type *pt = poly_type;
+		while (pt && pt->kind == Type_Generic && pt->Generic.specialized) {
+			pt = pt->Generic.specialized;
+		}
+		if (is_type_slice(pt) &&
+		    (is_type_dynamic_array(operand.type) || is_type_array(operand.type))) {
+			Ast *expr = unparen_expr(operand.expr);
+			if (expr->kind == Ast_CompoundLit) {
+				gbString es = type_to_string(base_any_array_type(operand.type));
+				error_line("\tSuggestion: Try using a slice compound literal instead '[]%s{...}'\n", es);
+				gb_string_free(es);
+			} else {
+				gbString os = expr_to_string(operand.expr);
+				error_line("\tSuggestion: Try slicing the value with '%s[:]'\n", os);
+				gb_string_free(os);
+			}
+		}
 	}
 	return t_invalid;
 }
@@ -1953,6 +1959,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 					error(name, "'#by_ptr' can only be applied to variable fields");
 					p->flags &= ~FieldFlag_by_ptr;
 				}
+				if (p->flags&FieldFlag_no_capture) {
+					error(name, "'#no_capture' can only be applied to variable fields");
+					p->flags &= ~FieldFlag_no_capture;
+				}
 
 				param = alloc_entity_type_name(scope, name->Ident.token, type, EntityState_Resolved);
 				param->TypeName.is_type_alias = true;
@@ -2054,6 +2064,28 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 						p->flags &= ~FieldFlag_by_ptr; // Remove the flag
 					}
 				}
+				if (p->flags&FieldFlag_no_capture) {
+					if (is_variadic && variadic_index == variables.count) {
+						if (p->flags & FieldFlag_c_vararg) {
+							error(name, "'#no_capture' cannot be applied to a #c_vararg parameter");
+							p->flags &= ~FieldFlag_no_capture;
+						} else {
+							error(name, "'#no_capture' is already implied on all variadic parameter");
+						}
+					} else if (is_type_polymorphic(type)) {
+						// ignore
+					} else {
+						if (is_type_internally_pointer_like(type)) {
+							error(name, "'#no_capture' is currently reserved for future use");
+						} else {
+							ERROR_BLOCK();
+							error(name, "'#no_capture' can only be applied to pointer-like types");
+							error_line("\t'#no_capture' does not currently do anything useful\n");
+							p->flags &= ~FieldFlag_no_capture;
+						}
+					}
+				}
+
 
 				if (is_poly_name) {
 					if (p->flags&FieldFlag_no_alias) {
@@ -2072,6 +2104,11 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 						error(name, "'#by_ptr' can only be applied to variable fields");
 						p->flags &= ~FieldFlag_by_ptr;
 					}
+					if (p->flags&FieldFlag_no_capture) {
+						error(name, "'#no_capture' can only be applied to variable fields");
+						p->flags &= ~FieldFlag_no_capture;
+					}
+
 
 					if (!is_type_polymorphic(type) && check_constant_parameter_value(type, params[i])) {
 						// failed
@@ -2091,6 +2128,8 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 				param->flags |= EntityFlag_Ellipsis;
 				if (is_c_vararg) {
 					param->flags |= EntityFlag_CVarArg;
+				} else {
+					param->flags |= EntityFlag_NoCapture;
 				}
 			}
 
@@ -2115,6 +2154,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 			if (p->flags&FieldFlag_by_ptr) {
 				param->flags |= EntityFlag_ByPtr;
 			}
+			if (p->flags&FieldFlag_no_capture) {
+				param->flags |= EntityFlag_NoCapture;
+			}
+
 
 			param->state = EntityState_Resolved; // NOTE(bill): This should have be resolved whilst determining it
 			add_entity(ctx, scope, name, param);
@@ -2430,9 +2473,15 @@ gb_internal i64 check_array_count(CheckerContext *ctx, Operand *o, Ast *e) {
 	if (e == nullptr) {
 		return 0;
 	}
-	if (e->kind == Ast_UnaryExpr &&
-	    e->UnaryExpr.op.kind == Token_Question) {
-		return -1;
+	if (e->kind == Ast_UnaryExpr) {
+		Token op = e->UnaryExpr.op;
+		if (op.kind == Token_Question) {
+			return -1;
+		}
+		if (e->UnaryExpr.expr == nullptr) {
+			error(op, "Invalid array count '[%.*s]'", LIT(op.string));
+			return 0;
+		}
 	}
 
 	check_expr_or_type(ctx, o, e);
diff --git a/src/checker.cpp b/src/checker.cpp
index 8756cce1a..3eae271a0 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -50,15 +50,6 @@ gb_internal bool check_rtti_type_disallowed(Ast *expr, Type *type, char const *f
 	return check_rtti_type_disallowed(ast_token(expr), type, format);
 }
 
-gb_internal void scope_reset(Scope *scope) {
-	if (scope == nullptr) return;
-
-	rw_mutex_lock(&scope->mutex);
-	scope->head_child.store(nullptr, std::memory_order_relaxed);
-	string_map_clear(&scope->elements);
-	ptr_set_clear(&scope->imported);
-	rw_mutex_unlock(&scope->mutex);
-}
 
 gb_internal void scope_reserve(Scope *scope, isize count) {
 	string_map_reserve(&scope->elements, 2*count);
@@ -168,9 +159,6 @@ gb_internal void import_graph_node_swap(ImportGraphNode **data, isize i, isize j
 }
 
 
-
-
-
 gb_internal void init_decl_info(DeclInfo *d, Scope *scope, DeclInfo *parent) {
 	gb_zero_item(d);
 	if (parent) {
@@ -184,6 +172,9 @@ gb_internal void init_decl_info(DeclInfo *d, Scope *scope, DeclInfo *parent) {
 	ptr_set_init(&d->deps, 0);
 	ptr_set_init(&d->type_info_deps, 0);
 	d->labels.allocator = heap_allocator();
+	d->variadic_reuses.allocator = heap_allocator();
+	d->variadic_reuse_max_bytes = 0;
+	d->variadic_reuse_max_align = 1;
 }
 
 gb_internal DeclInfo *make_decl_info(Scope *scope, DeclInfo *parent) {
@@ -381,6 +372,7 @@ gb_internal Entity *scope_lookup_current(Scope *s, String const &name) {
 	return nullptr;
 }
 
+
 gb_internal void scope_lookup_parent(Scope *scope, String const &name, Scope **scope_, Entity **entity_) {
 	if (scope != nullptr) {
 		bool gone_thru_proc = false;
@@ -508,9 +500,15 @@ end:;
 	return result;
 }
 
+gb_global bool in_single_threaded_checker_stage = false;
+
 gb_internal Entity *scope_insert(Scope *s, Entity *entity) {
 	String name = entity->token.string;
-	return scope_insert_with_name(s, name, entity);
+	if (in_single_threaded_checker_stage) {
+		return scope_insert_with_name_no_mutex(s, name, entity);
+	} else {
+		return scope_insert_with_name(s, name, entity);
+	}
 }
 
 gb_internal Entity *scope_insert_no_mutex(Scope *s, Entity *entity) {
@@ -655,7 +653,7 @@ gb_internal bool check_vet_shadowing(Checker *c, Entity *e, VettedEntity *ve) {
 		}
 	}
 
-	zero_item(ve);
+	gb_zero_item(ve);
 	ve->kind = VettedEntity_Shadowed;
 	ve->entity = e;
 	ve->other = shadowed;
@@ -674,7 +672,7 @@ gb_internal bool check_vet_unused(Checker *c, Entity *e, VettedEntity *ve) {
 			}
 		case Entity_ImportName:
 		case Entity_LibraryName:
-			zero_item(ve);
+			gb_zero_item(ve);
 			ve->kind = VettedEntity_Unused;
 			ve->entity = e;
 			return true;
@@ -1114,7 +1112,11 @@ gb_internal void init_universal(void) {
 		int minimum_os_version = 0;
 		if (build_context.minimum_os_version_string != "") {
 			int major, minor, revision = 0;
+		#if defined(GB_SYSTEM_WINDOWS)
+			sscanf_s(cast(const char *)(build_context.minimum_os_version_string.text), "%d.%d.%d", &major, &minor, &revision);
+		#else
 			sscanf(cast(const char *)(build_context.minimum_os_version_string.text), "%d.%d.%d", &major, &minor, &revision);
+		#endif
 			minimum_os_version = (major*10000)+(minor*100)+revision;
 		}
 		add_global_constant("ODIN_MINIMUM_OS_VERSION", t_untyped_integer, exact_value_i64(minimum_os_version));
@@ -1386,7 +1388,7 @@ gb_internal void reset_checker_context(CheckerContext *ctx, AstFile *file, Untyp
 	auto type_path = ctx->type_path;
 	array_clear(type_path);
 
-	zero_size(&ctx->pkg, gb_size_of(CheckerContext) - gb_offset_of(CheckerContext, pkg));
+	gb_zero_size(&ctx->pkg, gb_size_of(CheckerContext) - gb_offset_of(CheckerContext, pkg));
 
 	ctx->file = nullptr;
 	ctx->scope = builtin_pkg->scope;
@@ -1788,8 +1790,7 @@ gb_internal void add_entity_use(CheckerContext *c, Ast *identifier, Entity *enti
 	if (identifier == nullptr || identifier->kind != Ast_Ident) {
 		return;
 	}
-	Ast *empty_ident = nullptr;
-	entity->identifier.compare_exchange_strong(empty_ident, identifier);
+	entity->identifier.store(identifier);
 
 	identifier->Ident.entity = entity;
 
@@ -4584,6 +4585,8 @@ gb_internal void check_single_global_entity(Checker *c, Entity *e, DeclInfo *d)
 }
 
 gb_internal void check_all_global_entities(Checker *c) {
+	in_single_threaded_checker_stage = true;
+
 	// NOTE(bill): This must be single threaded
 	// Don't bother trying
 	for_array(i, c->info.entities) {
@@ -4603,6 +4606,8 @@ gb_internal void check_all_global_entities(Checker *c) {
 			(void)type_align_of(e->type);
 		}
 	}
+
+	in_single_threaded_checker_stage = false;
 }
 
 
diff --git a/src/checker.hpp b/src/checker.hpp
index 781737140..d76e4c7d0 100644
--- a/src/checker.hpp
+++ b/src/checker.hpp
@@ -181,6 +181,11 @@ char const *ProcCheckedState_strings[ProcCheckedState_COUNT] {
 	"Checked",
 };
 
+struct VariadicReuseData {
+	Type *slice_type; // ..elem_type
+	i64 max_count;
+};
+
 // DeclInfo is used to store information of certain declarations to allow for "any order" usage
 struct DeclInfo {
 	DeclInfo *    parent; // NOTE(bill): only used for procedure literals at the moment
@@ -219,6 +224,10 @@ struct DeclInfo {
 
 	Array<BlockLabel> labels;
 
+	Array<VariadicReuseData> variadic_reuses;
+	i64 variadic_reuse_max_bytes;
+	i64 variadic_reuse_max_align;
+
 	// NOTE(bill): this is to prevent a race condition since these procedure literals can be created anywhere at any time
 	struct lbModule *code_gen_module;
 };
diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp
index a90c52e61..3a2e1ce22 100644
--- a/src/checker_builtin_procs.hpp
+++ b/src/checker_builtin_procs.hpp
@@ -70,6 +70,9 @@ enum BuiltinProcId {
 	BuiltinProc_overflow_sub,
 	BuiltinProc_overflow_mul,
 
+	BuiltinProc_add_sat,
+	BuiltinProc_sub_sat,
+
 	BuiltinProc_sqrt,
 	BuiltinProc_fused_mul_add,
 
@@ -393,6 +396,9 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("overflow_sub"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("overflow_mul"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
+	{STR_LIT("add_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("sub_sat"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+
 	{STR_LIT("sqrt"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("fused_mul_add"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
diff --git a/src/common.cpp b/src/common.cpp
index 69426e2a6..0ef39bd10 100644
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -14,6 +14,8 @@
 #undef NOMINMAX
 #endif
 
+#include <string.h>
+
 #define GB_WINDOWS_H_INCLUDED
 #define GB_IMPLEMENTATION
 #include "gb/gb.h"
diff --git a/src/common_memory.cpp b/src/common_memory.cpp
index 60e570eee..47b2796a9 100644
--- a/src/common_memory.cpp
+++ b/src/common_memory.cpp
@@ -2,13 +2,6 @@
 #include <malloc.h>
 #endif
 
-gb_internal gb_inline void zero_size(void *ptr, isize len) {
-	memset(ptr, 0, len);
-}
-
-#define zero_item(ptr) zero_size((ptr), gb_size_of(ptr))
-
-
 template <typename U, typename V>
 gb_internal gb_inline U bit_cast(V &v) { return reinterpret_cast<U &>(v); }
 
@@ -39,6 +32,8 @@ gb_internal void virtual_memory_init(void) {
 }
 
 
+gb_internal Thread *get_current_thread(void);
+
 
 struct MemoryBlock {
 	MemoryBlock *prev;
@@ -50,8 +45,9 @@ struct MemoryBlock {
 struct Arena {
 	MemoryBlock * curr_block;
 	isize         minimum_block_size;
-	BlockingMutex mutex;
+	// BlockingMutex mutex;
 	isize         temp_count;
+	Thread *      parent_thread;
 };
 
 enum { DEFAULT_MINIMUM_BLOCK_SIZE = 8ll*1024ll*1024ll };
@@ -73,10 +69,20 @@ gb_internal isize arena_align_forward_offset(Arena *arena, isize alignment) {
 	return alignment_offset;
 }
 
+gb_internal void thread_init_arenas(Thread *t) {
+	t->permanent_arena = gb_alloc_item(heap_allocator(), Arena);
+	t->temporary_arena = gb_alloc_item(heap_allocator(), Arena);
+
+	t->permanent_arena->parent_thread = t;
+	t->temporary_arena->parent_thread = t;
+
+	t->permanent_arena->minimum_block_size = DEFAULT_MINIMUM_BLOCK_SIZE;
+	t->temporary_arena->minimum_block_size = DEFAULT_MINIMUM_BLOCK_SIZE;
+}
+
 gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment) {
 	GB_ASSERT(gb_is_power_of_two(alignment));
-	
-	mutex_lock(&arena->mutex);
+	GB_ASSERT(arena->parent_thread == get_current_thread());
 
 	isize size = 0;
 	if (arena->curr_block != nullptr) {
@@ -102,9 +108,7 @@ gb_internal void *arena_alloc(Arena *arena, isize min_size, isize alignment) {
 	
 	curr_block->used += size;
 	GB_ASSERT(curr_block->used <= curr_block->size);
-	
-	mutex_unlock(&arena->mutex);
-	
+
 	// NOTE(bill): memory will be zeroed by default due to virtual memory 
 	return ptr;	
 }
@@ -259,7 +263,7 @@ struct ArenaTemp {
 
 ArenaTemp arena_temp_begin(Arena *arena) {
 	GB_ASSERT(arena);
-	MUTEX_GUARD(&arena->mutex);
+	GB_ASSERT(arena->parent_thread == get_current_thread());
 
 	ArenaTemp temp = {};
 	temp.arena = arena;
@@ -274,7 +278,7 @@ ArenaTemp arena_temp_begin(Arena *arena) {
 void arena_temp_end(ArenaTemp const &temp) {
 	GB_ASSERT(temp.arena);
 	Arena *arena = temp.arena;
-	MUTEX_GUARD(&arena->mutex);
+	GB_ASSERT(arena->parent_thread == get_current_thread());
 
 	if (temp.block) {
 		bool memory_block_found = false;
@@ -310,7 +314,7 @@ void arena_temp_end(ArenaTemp const &temp) {
 void arena_temp_ignore(ArenaTemp const &temp) {
 	GB_ASSERT(temp.arena);
 	Arena *arena = temp.arena;
-	MUTEX_GUARD(&arena->mutex);
+	GB_ASSERT(arena->parent_thread == get_current_thread());
 
 	GB_ASSERT_MSG(arena->temp_count > 0, "double-use of arena_temp_end");
 	arena->temp_count -= 1;
@@ -370,14 +374,65 @@ gb_internal GB_ALLOCATOR_PROC(arena_allocator_proc) {
 }
 
 
-gb_global gb_thread_local Arena permanent_arena = {nullptr, DEFAULT_MINIMUM_BLOCK_SIZE};
-gb_internal gbAllocator permanent_allocator() {
-	return arena_allocator(&permanent_arena);
+enum ThreadArenaKind : uintptr {
+	ThreadArena_Permanent,
+	ThreadArena_Temporary,
+};
+
+gb_global Arena default_permanent_arena = {nullptr, DEFAULT_MINIMUM_BLOCK_SIZE};
+gb_global Arena default_temporary_arena = {nullptr, DEFAULT_MINIMUM_BLOCK_SIZE};
+
+
+gb_internal Arena *get_arena(ThreadArenaKind kind) {
+	Thread *t = get_current_thread();
+	switch (kind) {
+	case ThreadArena_Permanent: return t ? t->permanent_arena : &default_permanent_arena;
+	case ThreadArena_Temporary: return t ? t->temporary_arena : &default_temporary_arena;
+	}
+	GB_PANIC("INVALID ARENA KIND");
+	return nullptr;
+}
+
+
+
+gb_internal GB_ALLOCATOR_PROC(thread_arena_allocator_proc) {
+	void *ptr = nullptr;
+	ThreadArenaKind kind = cast(ThreadArenaKind)cast(uintptr)allocator_data;
+	Arena *arena = get_arena(kind);
+
+	switch (type) {
+	case gbAllocation_Alloc:
+		ptr = arena_alloc(arena, size, alignment);
+		break;
+	case gbAllocation_Free:
+		break;
+	case gbAllocation_Resize:
+		if (size == 0) {
+			ptr = nullptr;
+		} else if (size <= old_size) {
+			ptr = old_memory;
+		} else {
+			ptr = arena_alloc(arena, size, alignment);
+			gb_memmove(ptr, old_memory, old_size);
+		}
+		break;
+	case gbAllocation_FreeAll:
+		GB_PANIC("use arena_free_all directly");
+		arena_free_all(arena);
+		break;
+	}
+
+	return ptr;
+}
+
+
+
+gb_internal gbAllocator permanent_allocator() {
+	return {thread_arena_allocator_proc, cast(void *)cast(uintptr)ThreadArena_Permanent};
 }
 
-gb_global gb_thread_local Arena temporary_arena = {nullptr, DEFAULT_MINIMUM_BLOCK_SIZE};
 gb_internal gbAllocator temporary_allocator() {
-	return arena_allocator(&temporary_arena);
+	return {thread_arena_allocator_proc, cast(void *)cast(uintptr)ThreadArena_Permanent};
 }
 
 
@@ -385,7 +440,7 @@ gb_internal gbAllocator temporary_allocator() {
 
 
 // #define TEMPORARY_ALLOCATOR_GUARD()
-#define TEMPORARY_ALLOCATOR_GUARD() TEMP_ARENA_GUARD(&temporary_arena)
+#define TEMPORARY_ALLOCATOR_GUARD() TEMP_ARENA_GUARD(get_arena(ThreadArena_Temporary))
 #define PERMANENT_ALLOCATOR_GUARD()
 
 
diff --git a/src/entity.cpp b/src/entity.cpp
index 41d84e0f7..db6ffdd52 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -45,7 +45,7 @@ enum EntityFlag : u64 {
 	EntityFlag_Value         = 1ull<<11,
 	EntityFlag_BitFieldField = 1ull<<12,
 
-
+	EntityFlag_NoCapture = 1ull<<13, // #no_capture
 
 	EntityFlag_PolyConst     = 1ull<<15,
 	EntityFlag_NotExported   = 1ull<<16,
diff --git a/src/gb/gb.h b/src/gb/gb.h
index 22a30a04b..38dabc9bd 100644
--- a/src/gb/gb.h
+++ b/src/gb/gb.h
@@ -2534,7 +2534,7 @@ gb_inline void const *gb_pointer_add_const(void const *ptr, isize bytes)       {
 gb_inline void const *gb_pointer_sub_const(void const *ptr, isize bytes)       { return cast(void const *)(cast(u8 const *)ptr - bytes); }
 gb_inline isize       gb_pointer_diff     (void const *begin, void const *end) { return cast(isize)(cast(u8 const *)end - cast(u8 const *)begin); }
 
-gb_inline void gb_zero_size(void *ptr, isize size) { gb_memset(ptr, 0, size); }
+gb_inline void gb_zero_size(void *ptr, isize size) { memset(ptr, 0, size); }
 
 
 #if defined(_MSC_VER) && !defined(__clang__)
diff --git a/src/llvm_abi.cpp b/src/llvm_abi.cpp
index b2e485d01..c21cd0a46 100644
--- a/src/llvm_abi.cpp
+++ b/src/llvm_abi.cpp
@@ -15,6 +15,7 @@ struct lbArgType {
 	LLVMAttributeRef align_attribute; // Optional
 	i64 byval_alignment;
 	bool is_byval;
+	bool no_capture;
 };
 
 
@@ -159,6 +160,11 @@ gb_internal void lb_add_function_type_attributes(LLVMValueRef fn, lbFunctionType
 			LLVMAddAttributeAtIndex(fn, arg_index+1, arg->align_attribute);
 		}
 
+		if (arg->no_capture) {
+			LLVMAddAttributeAtIndex(fn, arg_index+1, nocapture_attr);
+		}
+
+
 		if (ft->multiple_return_original_type) {
 			if (ft->original_arg_count <= i) {
 				LLVMAddAttributeAtIndex(fn, arg_index+1, noalias_attr);
@@ -645,10 +651,10 @@ namespace lbAbiAmd64SysV {
 		if (is_mem_cls(cls, attribute_kind)) {
 			LLVMAttributeRef attribute = nullptr;
 			if (attribute_kind == Amd64TypeAttribute_ByVal) {
-				// if (!is_calling_convention_odin(calling_convention)) {
-					return lb_arg_type_indirect_byval(c, type);
-				// }
-				// attribute = nullptr;
+				if (is_calling_convention_odin(calling_convention)) {
+					return lb_arg_type_indirect(type, attribute);
+				}
+				return lb_arg_type_indirect_byval(c, type);
 			} else if (attribute_kind == Amd64TypeAttribute_StructRect) {
 				attribute = lb_create_enum_attribute_with_type(c, "sret", type);
 			}
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index 52661dfa7..9fa570eaf 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -1,13 +1,11 @@
 #define MULTITHREAD_OBJECT_GENERATION 1
-
-#ifndef USE_SEPARATE_MODULES
-#define USE_SEPARATE_MODULES build_context.use_separate_modules
-#endif
-
 #ifndef MULTITHREAD_OBJECT_GENERATION
 #define MULTITHREAD_OBJECT_GENERATION 0
 #endif
 
+#ifndef USE_SEPARATE_MODULES
+#define USE_SEPARATE_MODULES build_context.use_separate_modules
+#endif
 
 #ifndef LLVM_IGNORE_VERIFICATION
 #define LLVM_IGNORE_VERIFICATION 0
@@ -137,19 +135,28 @@ gb_internal void lb_set_entity_from_other_modules_linkage_correctly(lbModule *ot
 	if (other_module == nullptr) {
 		return;
 	}
-	char const *cname = alloc_cstring(temporary_allocator(), name);
+	char const *cname = alloc_cstring(permanent_allocator(), name);
+	mpsc_enqueue(&other_module->gen->entities_to_correct_linkage, lbEntityCorrection{other_module, e, cname});
+}
 
-	LLVMValueRef other_global = nullptr;
-	if (e->kind == Entity_Variable) {
-		other_global = LLVMGetNamedGlobal(other_module->mod, cname);
-	} else if (e->kind == Entity_Procedure) {
-		other_global = LLVMGetNamedFunction(other_module->mod, cname);
-	}
-	if (other_global) {
-		LLVMSetLinkage(other_global, LLVMExternalLinkage);
+gb_internal void lb_correct_entity_linkage(lbGenerator *gen) {
+	for (lbEntityCorrection ec = {}; mpsc_dequeue(&gen->entities_to_correct_linkage, &ec); /**/) {
+		LLVMValueRef other_global = nullptr;
+		if (ec.e->kind == Entity_Variable) {
+			other_global = LLVMGetNamedGlobal(ec.other_module->mod, ec.cname);
+			if (other_global) {
+				LLVMSetLinkage(other_global, LLVMWeakAnyLinkage);
+			}
+		} else if (ec.e->kind == Entity_Procedure) {
+			other_global = LLVMGetNamedFunction(ec.other_module->mod, ec.cname);
+			if (other_global) {
+				LLVMSetLinkage(other_global, LLVMWeakAnyLinkage);
+			}
+		}
 	}
 }
 
+
 gb_internal void lb_emit_init_context(lbProcedure *p, lbAddr addr) {
 	TEMPORARY_ALLOCATOR_GUARD();
 
@@ -1387,6 +1394,7 @@ gb_internal void lb_create_global_procedures_and_types(lbGenerator *gen, Checker
 		if (USE_SEPARATE_MODULES) {
 			m = lb_module_of_entity(gen, e);
 		}
+		GB_ASSERT(m != nullptr);
 
 		if (e->kind == Entity_Procedure) {
 			array_add(&m->global_procedures_to_create, e);
@@ -1432,7 +1440,9 @@ gb_internal bool lb_is_module_empty(lbModule *m) {
 	}
 
 	for (auto g = LLVMGetFirstGlobal(m->mod); g != nullptr; g = LLVMGetNextGlobal(g)) {
-		if (LLVMGetLinkage(g) == LLVMExternalLinkage) {
+		LLVMLinkage linkage = LLVMGetLinkage(g);
+		if (linkage == LLVMExternalLinkage ||
+		    linkage == LLVMWeakAnyLinkage) {
 			continue;
 		}
 		if (!LLVMIsExternallyInitialized(g)) {
@@ -1570,6 +1580,7 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
 
 	switch (build_context.optimization_level) {
 	case -1:
+		array_add(&passes, "function(annotation-remarks)");
 		break;
 	case 0:
 		array_add(&passes, "always-inline");
@@ -3260,7 +3271,7 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 			LLVMSetLinkage(g.value, LLVMDLLExportLinkage);
 			LLVMSetDLLStorageClass(g.value, LLVMDLLExportStorageClass);
 		} else if (!is_foreign) {
-			LLVMSetLinkage(g.value, USE_SEPARATE_MODULES ? LLVMExternalLinkage : LLVMInternalLinkage);
+			LLVMSetLinkage(g.value, USE_SEPARATE_MODULES ? LLVMWeakAnyLinkage : LLVMInternalLinkage);
 		}
 		lb_set_linkage_from_entity_flags(m, g.value, e->flags);
 		
@@ -3277,11 +3288,12 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 			if (!is_type_any(e->type) && !is_type_union(e->type)) {
 				if (tav.mode != Addressing_Invalid) {
 					if (tav.value.kind != ExactValue_Invalid) {
+						bool is_rodata = e->kind == Entity_Variable && e->Variable.is_rodata;
 						ExactValue v = tav.value;
-						lbValue init = lb_const_value(m, tav.type, v);
+						lbValue init = lb_const_value(m, tav.type, v, false, is_rodata);
 						LLVMSetInitializer(g.value, init.value);
 						var.is_initialized = true;
-						if (e->kind == Entity_Variable && e->Variable.is_rodata) {
+						if (is_rodata) {
 							LLVMSetGlobalConstant(g.value, true);
 						}
 					}
@@ -3430,6 +3442,8 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 	TIME_SECTION("LLVM Add Foreign Library Paths");
 	lb_add_foreign_library_paths(gen);
 
+	TIME_SECTION("LLVM Correct Entity Linkage");
+	lb_correct_entity_linkage(gen);
 
 	////////////////////////////////////////////
 	for (auto const &entry: gen->modules) {
diff --git a/src/llvm_backend.hpp b/src/llvm_backend.hpp
index 005358734..02daecf6b 100644
--- a/src/llvm_backend.hpp
+++ b/src/llvm_backend.hpp
@@ -147,6 +147,7 @@ struct lbModule {
 	CheckerInfo *info;
 	AstPackage *pkg; // possibly associated
 	AstFile *file;   // possibly associated
+	char const *module_name;
 
 	PtrMap<Type *, LLVMTypeRef> types;                             // mutex: types_mutex
 	PtrMap<void *, lbStructFieldRemapping> struct_field_remapping; // Key: LLVMTypeRef or Type *, mutex: types_mutex
@@ -200,6 +201,12 @@ struct lbModule {
 	LLVMPassManagerRef function_pass_managers[lbFunctionPassManager_COUNT];
 };
 
+struct lbEntityCorrection {
+	lbModule *  other_module;
+	Entity *    e;
+	char const *cname;
+};
+
 struct lbGenerator : LinkerData {
 	CheckerInfo *info;
 
@@ -218,6 +225,8 @@ struct lbGenerator : LinkerData {
 	lbProcedure *startup_runtime;
 	lbProcedure *cleanup_runtime;
 	lbProcedure *objc_names;
+
+	MPSCQueue<lbEntityCorrection> entities_to_correct_linkage;
 };
 
 
@@ -296,6 +305,11 @@ enum lbProcedureFlag : u32 {
 	lbProcedureFlag_DebugAllocaCopy = 1<<1,
 };
 
+struct lbVariadicReuseSlices {
+	Type *slice_type;
+	lbAddr slice_addr;
+};
+
 struct lbProcedure {
 	u32 flags;
 	u16 state_flags;
@@ -336,8 +350,10 @@ struct lbProcedure {
 	bool             in_multi_assignment;
 	Array<LLVMValueRef> raw_input_parameters;
 
-	LLVMValueRef temp_callee_return_struct_memory;
+	Array<lbVariadicReuseSlices> variadic_reuses;
+	lbAddr variadic_reuse_base_array_ptr;
 
+	LLVMValueRef temp_callee_return_struct_memory;
 	Ast *curr_stmt;
 
 	Array<Scope *>       scope_stack;
@@ -364,7 +380,7 @@ struct lbProcedure {
 
 gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c);
 
-gb_internal String lb_mangle_name(lbModule *m, Entity *e);
+gb_internal String lb_mangle_name(Entity *e);
 gb_internal String lb_get_entity_name(lbModule *m, Entity *e, String name = {});
 
 gb_internal LLVMAttributeRef lb_create_enum_attribute(LLVMContextRef ctx, char const *name, u64 value=0);
@@ -382,7 +398,7 @@ gb_internal lbBlock *lb_create_block(lbProcedure *p, char const *name, bool appe
 
 gb_internal lbValue lb_const_nil(lbModule *m, Type *type);
 gb_internal lbValue lb_const_undef(lbModule *m, Type *type);
-gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_local=true);
+gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_local=true, bool is_rodata=false);
 gb_internal lbValue lb_const_bool(lbModule *m, Type *type, bool value);
 gb_internal lbValue lb_const_int(lbModule *m, Type *type, u64 value);
 
diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp
index 9cc0552de..6a6b119aa 100644
--- a/src/llvm_backend_const.cpp
+++ b/src/llvm_backend_const.cpp
@@ -338,6 +338,15 @@ gb_internal lbValue lb_emit_source_code_location_as_global_ptr(lbProcedure *p, S
 	return addr.addr;
 }
 
+gb_internal lbValue lb_const_source_code_location_as_global_ptr(lbModule *m, String const &procedure, TokenPos const &pos) {
+	lbValue loc = lb_const_source_code_location_const(m, procedure, pos);
+	lbAddr addr = lb_add_global_generated(m, loc.type, loc, nullptr);
+	lb_make_global_private_const(addr);
+	return addr.addr;
+}
+
+
+
 
 gb_internal lbValue lb_emit_source_code_location_as_global_ptr(lbProcedure *p, Ast *node) {
 	lbValue loc = lb_emit_source_code_location_const(p, node);
@@ -356,7 +365,11 @@ gb_internal lbValue lb_emit_source_code_location_as_global(lbProcedure *p, Ast *
 
 
 
-gb_internal LLVMValueRef lb_build_constant_array_values(lbModule *m, Type *type, Type *elem_type, isize count, LLVMValueRef *values, bool allow_local) {
+gb_internal LLVMValueRef lb_build_constant_array_values(lbModule *m, Type *type, Type *elem_type, isize count, LLVMValueRef *values, bool allow_local, bool is_rodata) {
+	if (allow_local) {
+		is_rodata = false;
+	}
+
 	bool is_local = allow_local && m->curr_procedure != nullptr;
 	bool is_const = true;
 	if (is_local) {
@@ -425,6 +438,8 @@ gb_internal LLVMValueRef lb_big_int_to_llvm(lbModule *m, Type *original_type, Bi
 		}
 	}
 
+	GB_ASSERT(!is_type_array(original_type));
+
 	LLVMValueRef value = LLVMConstIntOfArbitraryPrecision(lb_type(m, original_type), cast(unsigned)((sz+7)/8), cast(u64 *)rop);
 	if (big_int_is_neg(a)) {
 		value = LLVMConstNeg(value);
@@ -459,7 +474,11 @@ gb_internal bool lb_is_nested_possibly_constant(Type *ft, Selection const &sel,
 	return lb_is_elem_const(elem, ft);
 }
 
-gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_local) {
+gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_local, bool is_rodata) {
+	if (allow_local) {
+		is_rodata = false;
+	}
+
 	LLVMContextRef ctx = m->ctx;
 
 	type = default_type(type);
@@ -517,7 +536,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 			count = gb_max(cast(isize)cl->max_count, count);
 			Type *elem = base_type(type)->Slice.elem;
 			Type *t = alloc_type_array(elem, count);
-			lbValue backing_array = lb_const_value(m, t, value, allow_local);
+			lbValue backing_array = lb_const_value(m, t, value, allow_local, is_rodata);
 
 			LLVMValueRef array_data = nullptr;
 
@@ -554,6 +573,10 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 				array_data = LLVMAddGlobal(m->mod, lb_type(m, t), str);
 				LLVMSetInitializer(array_data, backing_array.value);
 
+				if (is_rodata) {
+					LLVMSetGlobalConstant(array_data, true);
+				}
+
 				lbValue g = {};
 				g.value = array_data;
 				g.type = t;
@@ -601,7 +624,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 		}
 		// NOTE(bill, 2021-10-07): Allow for array programming value constants
 		Type *core_elem = core_array_type(type);
-		return lb_const_value(m, core_elem, value, allow_local);		
+		return lb_const_value(m, core_elem, value, allow_local, is_rodata);
 	} else if (is_type_u8_array(type) && value.kind == ExactValue_String) {
 		GB_ASSERT(type->Array.count == value.value_string.len);
 		LLVMValueRef data = LLVMConstStringInContext(ctx,
@@ -619,7 +642,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 		Type *elem = type->Array.elem;
 
 
-		lbValue single_elem = lb_const_value(m, elem, value, allow_local);
+		lbValue single_elem = lb_const_value(m, elem, value, allow_local, is_rodata);
 
 		LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, cast(isize)count);
 		for (i64 i = 0; i < count; i++) {
@@ -637,7 +660,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 		
 		Type *elem = type->Matrix.elem;
 		
-		lbValue single_elem = lb_const_value(m, elem, value, allow_local);
+		lbValue single_elem = lb_const_value(m, elem, value, allow_local, is_rodata);
 		single_elem.value = llvm_const_cast(single_elem.value, lb_type(m, elem));
 				
 		i64 total_elem_count = matrix_type_total_internal_elems(type);
@@ -659,7 +682,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 		i64 count = type->SimdVector.count;
 		Type *elem = type->SimdVector.elem;
 
-		lbValue single_elem = lb_const_value(m, elem, value, allow_local);
+		lbValue single_elem = lb_const_value(m, elem, value, allow_local, is_rodata);
 		single_elem.value = llvm_const_cast(single_elem.value, lb_type(m, elem));
 
 		LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, count);
@@ -788,7 +811,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 
 	case ExactValue_Compound:
 		if (is_type_slice(type)) {
-			return lb_const_value(m, type, value, allow_local);
+			return lb_const_value(m, type, value, allow_local, is_rodata);
 		} else if (is_type_array(type)) {
 			ast_node(cl, CompoundLit, value.value_compound);
 			Type *elem_type = type->Array.elem;
@@ -822,7 +845,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							}
 							if (lo == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								for (i64 k = lo; k < hi; k++) {
 									values[value_index++] = val;
 								}
@@ -837,7 +860,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							i64 index = exact_value_to_i64(index_tav.value);
 							if (index == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								values[value_index++] = val;
 								found = true;
 								break;
@@ -850,7 +873,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					}
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->Array.count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->Array.count, values, allow_local, is_rodata);
 				return res;
 			} else {
 				GB_ASSERT_MSG(elem_count == type->Array.count, "%td != %td", elem_count, type->Array.count);
@@ -860,13 +883,13 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 				for (isize i = 0; i < elem_count; i++) {
 					TypeAndValue tav = cl->elems[i]->tav;
 					GB_ASSERT(tav.mode != Addressing_Invalid);
-					values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+					values[i] = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 				}
 				for (isize i = elem_count; i < type->Array.count; i++) {
 					values[i] = LLVMConstNull(lb_type(m, elem_type));
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->Array.count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->Array.count, values, allow_local, is_rodata);
 				return res;
 			}
 		} else if (is_type_enumerated_array(type)) {
@@ -906,7 +929,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							}
 							if (lo == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								for (i64 k = lo; k < hi; k++) {
 									values[value_index++] = val;
 								}
@@ -921,7 +944,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							i64 index = exact_value_to_i64(index_tav.value);
 							if (index == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								values[value_index++] = val;
 								found = true;
 								break;
@@ -934,7 +957,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					}
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->EnumeratedArray.count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->EnumeratedArray.count, values, allow_local, is_rodata);
 				return res;
 			} else {
 				GB_ASSERT_MSG(elem_count == type->EnumeratedArray.count, "%td != %td", elem_count, type->EnumeratedArray.count);
@@ -944,13 +967,13 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 				for (isize i = 0; i < elem_count; i++) {
 					TypeAndValue tav = cl->elems[i]->tav;
 					GB_ASSERT(tav.mode != Addressing_Invalid);
-					values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+					values[i] = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 				}
 				for (isize i = elem_count; i < type->EnumeratedArray.count; i++) {
 					values[i] = LLVMConstNull(lb_type(m, elem_type));
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->EnumeratedArray.count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)type->EnumeratedArray.count, values, allow_local, is_rodata);
 				return res;
 			}
 		} else if (is_type_simd_vector(type)) {
@@ -989,7 +1012,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							}
 							if (lo == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								for (i64 k = lo; k < hi; k++) {
 									values[value_index++] = val;
 								}
@@ -1004,7 +1027,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 							i64 index = exact_value_to_i64(index_tav.value);
 							if (index == i) {
 								TypeAndValue tav = fv->value->tav;
-								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+								LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 								values[value_index++] = val;
 								found = true;
 								break;
@@ -1023,7 +1046,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 				for (isize i = 0; i < elem_count; i++) {
 					TypeAndValue tav = cl->elems[i]->tav;
 					GB_ASSERT(tav.mode != Addressing_Invalid);
-					values[i] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+					values[i] = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 				}
 				LLVMTypeRef et = lb_type(m, elem_type);
 
@@ -1072,7 +1095,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					i32 index = field_remapping[f->Variable.field_index];
 					if (elem_type_can_be_constant(f->type)) {
 						if (sel.index.count == 1) {
-							values[index]  = lb_const_value(m, f->type, tav.value, allow_local).value;
+							values[index]  = lb_const_value(m, f->type, tav.value, allow_local, is_rodata).value;
 							visited[index] = true;
 						} else {
 							if (!visited[index]) {
@@ -1114,7 +1137,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 									}
 								}
 								if (is_constant) {
-									LLVMValueRef elem_value = lb_const_value(m, tav.type, tav.value, allow_local).value;
+									LLVMValueRef elem_value = lb_const_value(m, tav.type, tav.value, allow_local, is_rodata).value;
 									if (LLVMIsConstant(elem_value)) {
 										values[index] = llvm_const_insert_value(m, values[index], elem_value, idx_list, idx_list_len);
 									} else {
@@ -1136,7 +1159,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 
 					i32 index = field_remapping[f->Variable.field_index];
 					if (elem_type_can_be_constant(f->type)) {
-						values[index]  = lb_const_value(m, f->type, val, allow_local).value;
+						values[index]  = lb_const_value(m, f->type, val, allow_local, is_rodata).value;
 						visited[index] = true;
 					}
 				}
@@ -1262,7 +1285,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 						
 						
 						TypeAndValue tav = fv->value->tav;
-						LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+						LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 						for (i64 k = lo; k < hi; k++) {
 							i64 offset = matrix_row_major_index_to_offset(type, k);
 							GB_ASSERT(values[offset] == nullptr);
@@ -1274,7 +1297,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 						i64 index = exact_value_to_i64(index_tav.value);
 						GB_ASSERT(index < max_count);
 						TypeAndValue tav = fv->value->tav;
-						LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local).value;
+						LLVMValueRef val = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 						i64 offset = matrix_row_major_index_to_offset(type, index);
 						GB_ASSERT(values[offset] == nullptr);
 						values[offset] = val;
@@ -1287,7 +1310,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					}
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)total_count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)total_count, values, allow_local, is_rodata);
 				return res;
 			} else {
 				GB_ASSERT_MSG(elem_count == max_count, "%td != %td", elem_count, max_count);
@@ -1298,7 +1321,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					GB_ASSERT(tav.mode != Addressing_Invalid);
 					i64 offset = 0;
 					offset = matrix_row_major_index_to_offset(type, i);
-					values[offset] = lb_const_value(m, elem_type, tav.value, allow_local).value;
+					values[offset] = lb_const_value(m, elem_type, tav.value, allow_local, is_rodata).value;
 				}
 				for (isize i = 0; i < total_count; i++) {
 					if (values[i] == nullptr) {
@@ -1306,7 +1329,7 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bo
 					}
 				}
 
-				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)total_count, values, allow_local);
+				res.value = lb_build_constant_array_values(m, type, elem_type, cast(isize)total_count, values, allow_local, is_rodata);
 				return res;
 			}
 		} else {
diff --git a/src/llvm_backend_debug.cpp b/src/llvm_backend_debug.cpp
index f1ace5f06..c896f889d 100644
--- a/src/llvm_backend_debug.cpp
+++ b/src/llvm_backend_debug.cpp
@@ -1187,6 +1187,7 @@ gb_internal void add_debug_info_for_global_constant_from_entity(lbGenerator *gen
 	if (USE_SEPARATE_MODULES) {
 		m = lb_module_of_entity(gen, e);
 	}
+	GB_ASSERT(m != nullptr);
 
 	if (is_type_integer(e->type)) {
 		ExactValue const &value = e->Constant.value;
diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp
index bcacc0537..4bb2676d1 100644
--- a/src/llvm_backend_expr.cpp
+++ b/src/llvm_backend_expr.cpp
@@ -296,12 +296,6 @@ gb_internal bool lb_try_direct_vector_arith(lbProcedure *p, TokenKind op, lbValu
 		GB_ASSERT(vector_type0 == vector_type1);
 		LLVMTypeRef vector_type = vector_type0;
 
-		LLVMValueRef lhs_vp = LLVMBuildPointerCast(p->builder, lhs_ptr.value, LLVMPointerType(vector_type, 0), "");
-		LLVMValueRef rhs_vp = LLVMBuildPointerCast(p->builder, rhs_ptr.value, LLVMPointerType(vector_type, 0), "");
-		LLVMValueRef x = LLVMBuildLoad2(p->builder, vector_type, lhs_vp, "");
-		LLVMValueRef y = LLVMBuildLoad2(p->builder, vector_type, rhs_vp, "");
-		LLVMValueRef z = nullptr;
-
 		Type *integral_type = base_type(elem_type);
 		if (is_type_simd_vector(integral_type)) {
 			integral_type = core_array_type(integral_type);
@@ -311,8 +305,18 @@ gb_internal bool lb_try_direct_vector_arith(lbProcedure *p, TokenKind op, lbValu
 			case Token_Add: op = Token_Or;     break;
 			case Token_Sub: op = Token_AndNot; break;
 			}
+			Type *u = bit_set_to_int(type);
+			if (is_type_array(u)) {
+				return false;
+			}
 		}
 
+		LLVMValueRef lhs_vp = LLVMBuildPointerCast(p->builder, lhs_ptr.value, LLVMPointerType(vector_type, 0), "");
+		LLVMValueRef rhs_vp = LLVMBuildPointerCast(p->builder, rhs_ptr.value, LLVMPointerType(vector_type, 0), "");
+		LLVMValueRef x = LLVMBuildLoad2(p->builder, vector_type, lhs_vp, "");
+		LLVMValueRef y = LLVMBuildLoad2(p->builder, vector_type, rhs_vp, "");
+		LLVMValueRef z = nullptr;
+
 		if (is_type_float(integral_type)) {
 			switch (op) {
 			case Token_Add:
@@ -1286,6 +1290,14 @@ handle_op:;
 		case Token_Add: op = Token_Or;     break;
 		case Token_Sub: op = Token_AndNot; break;
 		}
+		Type *u = bit_set_to_int(type);
+		if (is_type_array(u)) {
+			lhs.type = u;
+			rhs.type = u;
+			res = lb_emit_arith(p, op, lhs, rhs, u);
+			res.type = type;
+			return res;
+		}
 	}
 
 	Type *integral_type = type;
@@ -1441,6 +1453,7 @@ gb_internal lbValue lb_build_binary_in(lbProcedure *p, lbValue left, lbValue rig
 			GB_ASSERT(are_types_identical(left.type, key_type));
 
 			Type *it = bit_set_to_int(rt);
+
 			left = lb_emit_conv(p, left, it);
 			if (is_type_different_to_arch_endianness(it)) {
 				left = lb_emit_byte_swap(p, left, integer_endian_type_to_platform_type(it));
@@ -2054,6 +2067,26 @@ gb_internal lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
 		}
 	}
 
+	// bit_set <-> backing type
+	if (is_type_bit_set(src)) {
+		Type *backing = bit_set_to_int(src);
+		if (are_types_identical(backing, dst)) {
+			lbValue res = {};
+			res.type = t;
+			res.value = value.value;
+			return res;
+		}
+	}
+	if (is_type_bit_set(dst)) {
+		Type *backing = bit_set_to_int(dst);
+		if (are_types_identical(src, backing)) {
+			lbValue res = {};
+			res.type = t;
+			res.value = value.value;
+			return res;
+		}
+	}
+
 
 	// Pointer <-> uintptr
 	if (is_type_pointer(src) && is_type_uintptr(dst)) {
@@ -2951,13 +2984,32 @@ gb_internal lbValue lb_emit_comp_against_nil(lbProcedure *p, TokenKind op_kind,
 	case Type_Pointer:
 	case Type_MultiPointer:
 	case Type_Proc:
-	case Type_BitSet:
 		if (op_kind == Token_CmpEq) {
 			res.value = LLVMBuildIsNull(p->builder, x.value, "");
 		} else if (op_kind == Token_NotEq) {
 			res.value = LLVMBuildIsNotNull(p->builder, x.value, "");
 		}
 		return res;
+	case Type_BitSet:
+		{
+			Type *u = bit_set_to_int(bt);
+			if (is_type_array(u)) {
+				auto args = array_make<lbValue>(permanent_allocator(), 2);
+				lbValue lhs = lb_address_from_load_or_generate_local(p, x);
+				args[0] = lb_emit_conv(p, lhs, t_rawptr);
+				args[1] = lb_const_int(p->module, t_int, type_size_of(t));
+				lbValue val = lb_emit_runtime_call(p, "memory_compare_zero", args);
+				lbValue res = lb_emit_comp(p, op_kind, val, lb_const_int(p->module, t_int, 0));
+				return res;
+			} else {
+				if (op_kind == Token_CmpEq) {
+					res.value = LLVMBuildIsNull(p->builder, x.value, "");
+				} else if (op_kind == Token_NotEq) {
+					res.value = LLVMBuildIsNotNull(p->builder, x.value, "");
+				}
+			}
+			return res;
+		}
 
 	case Type_Slice:
 		{
@@ -4878,29 +4930,43 @@ gb_internal lbAddr lb_build_addr_compound_lit(lbProcedure *p, Ast *expr) {
 	case Type_BitSet: {
 		i64 sz = type_size_of(type);
 		if (cl->elems.count > 0 && sz > 0) {
-			lb_addr_store(p, v, lb_const_value(p->module, type, exact_value_compound(expr)));
-
 			lbValue lower = lb_const_value(p->module, t_int, exact_value_i64(bt->BitSet.lower));
-			for (Ast *elem : cl->elems) {
-				GB_ASSERT(elem->kind != Ast_FieldValue);
 
-				if (lb_is_elem_const(elem, et)) {
-					continue;
+			Type *backing = bit_set_to_int(type);
+			if (is_type_array(backing)) {
+				GB_PANIC("TODO: bit_set [N]T");
+				Type *base_it = core_array_type(backing);
+				i64 bits_per_elem = 8*type_size_of(base_it);
+				gb_unused(bits_per_elem);
+				lbValue one = lb_const_value(p->module, t_i64, exact_value_i64(1));
+				for (Ast *elem : cl->elems) {
+					GB_ASSERT(elem->kind != Ast_FieldValue);
+					lbValue expr = lb_build_expr(p, elem);
+					GB_ASSERT(expr.type->kind != Type_Tuple);
+
+					lbValue e = lb_emit_conv(p, expr, t_i64);
+					e = lb_emit_arith(p, Token_Sub, e, lower, t_i64);
+					// lbValue idx = lb_emit_arith(p, Token_Div, e, bits_per_elem, t_i64);
+					// lbValue val = lb_emit_arith(p, Token_Div, e, bits_per_elem, t_i64);
 				}
-
-				lbValue expr = lb_build_expr(p, elem);
-				GB_ASSERT(expr.type->kind != Type_Tuple);
-
+			} else {
 				Type *it = bit_set_to_int(bt);
 				lbValue one = lb_const_value(p->module, it, exact_value_i64(1));
-				lbValue e = lb_emit_conv(p, expr, it);
-				e = lb_emit_arith(p, Token_Sub, e, lower, it);
-				e = lb_emit_arith(p, Token_Shl, one, e, it);
+				for (Ast *elem : cl->elems) {
+					GB_ASSERT(elem->kind != Ast_FieldValue);
 
-				lbValue old_value = lb_emit_transmute(p, lb_addr_load(p, v), it);
-				lbValue new_value = lb_emit_arith(p, Token_Or, old_value, e, it);
-				new_value = lb_emit_transmute(p, new_value, type);
-				lb_addr_store(p, v, new_value);
+					lbValue expr = lb_build_expr(p, elem);
+					GB_ASSERT(expr.type->kind != Type_Tuple);
+
+					lbValue e = lb_emit_conv(p, expr, it);
+					e = lb_emit_arith(p, Token_Sub, e, lower, it);
+					e = lb_emit_arith(p, Token_Shl, one, e, it);
+
+					lbValue old_value = lb_emit_transmute(p, lb_addr_load(p, v), it);
+					lbValue new_value = lb_emit_arith(p, Token_Or, old_value, e, it);
+					new_value = lb_emit_transmute(p, new_value, type);
+					lb_addr_store(p, v, new_value);
+				}
 			}
 		}
 		break;
diff --git a/src/llvm_backend_general.cpp b/src/llvm_backend_general.cpp
index f5595b70e..a91c1d1fe 100644
--- a/src/llvm_backend_general.cpp
+++ b/src/llvm_backend_general.cpp
@@ -29,8 +29,9 @@ gb_internal void lb_init_module(lbModule *m, Checker *c) {
 		module_name = gb_string_appendc(module_name, "-builtin");
 	}
 
+	m->module_name = module_name ? module_name : "odin_package";
 	m->ctx = LLVMContextCreate();
-	m->mod = LLVMModuleCreateWithNameInContext(module_name ? module_name : "odin_package", m->ctx);
+	m->mod = LLVMModuleCreateWithNameInContext(m->module_name, m->ctx);
 	// m->debug_builder = nullptr;
 	if (build_context.ODIN_DEBUG) {
 		enum {DEBUG_METADATA_VERSION = 3};
@@ -71,7 +72,7 @@ gb_internal void lb_init_module(lbModule *m, Checker *c) {
 	map_init(&m->hasher_procs);
 	map_init(&m->map_get_procs);
 	map_init(&m->map_set_procs);
-	if (build_context.use_separate_modules) {
+	if (USE_SEPARATE_MODULES) {
 		array_init(&m->procedures_to_generate, a, 0, 1<<10);
 		map_init(&m->procedure_values,               1<<11);
 	} else {
@@ -151,6 +152,8 @@ gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c) {
 		map_set(&gen->modules_through_ctx, ctx, m);
 	}
 
+	mpsc_init(&gen->entities_to_correct_linkage, heap_allocator());
+
 	return true;
 }
 
@@ -387,12 +390,14 @@ gb_internal lbModule *lb_module_of_entity(lbGenerator *gen, Entity *e) {
 	if (e->file) {
 		found = map_get(&gen->modules, cast(void *)e->file);
 		if (found) {
+			GB_ASSERT(*found != nullptr);
 			return *found;
 		}
 	}
 	if (e->pkg) {
 		found = map_get(&gen->modules, cast(void *)e->pkg);
 		if (found) {
+			GB_ASSERT(*found != nullptr);
 			return *found;
 		}
 	}
@@ -1018,6 +1023,8 @@ gb_internal void lb_emit_store(lbProcedure *p, lbValue ptr, lbValue value) {
 			LLVMTypeRef rawptr_type = lb_type(p->module, t_rawptr);
 			LLVMTypeRef rawptr_ptr_type = LLVMPointerType(rawptr_type, 0);
 			LLVMBuildStore(p->builder, LLVMConstNull(rawptr_type), LLVMBuildBitCast(p->builder, ptr.value, rawptr_ptr_type, ""));
+		} else if (is_type_bit_set(a)) {
+			lb_mem_zero_ptr(p, ptr.value, a, 1);
 		} else if (lb_sizeof(src_t) <= lb_max_zero_init_size()) {
 			LLVMBuildStore(p->builder, LLVMConstNull(src_t), ptr.value);
 		} else {
@@ -1105,7 +1112,7 @@ gb_internal lbValue lb_emit_load(lbProcedure *p, lbValue value) {
 		return lb_addr_load(p, addr);
 	}
 
-	GB_ASSERT(is_type_pointer(value.type));
+	GB_ASSERT_MSG(is_type_pointer(value.type), "%s", type_to_string(value.type));
 	Type *t = type_deref(value.type);
 	LLVMValueRef v = LLVMBuildLoad2(p->builder, lb_type(p->module, t), value.value, "");
 
@@ -1530,7 +1537,7 @@ gb_internal void lb_clone_struct_type(LLVMTypeRef dst, LLVMTypeRef src) {
 	LLVMStructSetBody(dst, fields, field_count, LLVMIsPackedStruct(src));
 }
 
-gb_internal String lb_mangle_name(lbModule *m, Entity *e) {
+gb_internal String lb_mangle_name(Entity *e) {
 	String name = e->token.string;
 
 	AstPackage *pkg = e->pkg;
@@ -1630,6 +1637,7 @@ gb_internal String lb_set_nested_type_name_ir_mangled_name(Entity *e, lbProcedur
 }
 
 gb_internal String lb_get_entity_name(lbModule *m, Entity *e, String default_name) {
+	GB_ASSERT(m != nullptr);
 	if (e != nullptr && e->kind == Entity_TypeName && e->TypeName.ir_mangled_name.len != 0) {
 		return e->TypeName.ir_mangled_name;
 	}
@@ -1661,7 +1669,7 @@ gb_internal String lb_get_entity_name(lbModule *m, Entity *e, String default_nam
 	}
 
 	if (!no_name_mangle) {
-		name = lb_mangle_name(m, e);
+		name = lb_mangle_name(e);
 	}
 	if (name.len == 0) {
 		name = e->token.string;
@@ -3045,7 +3053,7 @@ gb_internal lbValue lb_find_value_from_entity(lbModule *m, Entity *e) {
 			if (e->code_gen_module != nullptr) {
 				other_module = e->code_gen_module;
 			} else {
-				other_module = nullptr;
+				other_module = &m->gen->default_module;
 			}
 			is_external = other_module != m;
 		}
@@ -3063,8 +3071,6 @@ gb_internal lbValue lb_find_value_from_entity(lbModule *m, Entity *e) {
 
 			lb_set_entity_from_other_modules_linkage_correctly(other_module, e, name);
 
-			// LLVMSetLinkage(other_g.value, LLVMExternalLinkage);
-
 			if (e->Variable.thread_local_model != "") {
 				LLVMSetThreadLocal(g.value, true);
 
@@ -3088,7 +3094,9 @@ gb_internal lbValue lb_find_value_from_entity(lbModule *m, Entity *e) {
 			return g;
 		}
 	}
-	GB_PANIC("\n\tError in: %s, missing value '%.*s'\n", token_pos_to_string(e->token.pos), LIT(e->token.string));
+
+	GB_PANIC("\n\tError in: %s, missing value '%.*s' in module %s\n",
+	         token_pos_to_string(e->token.pos), LIT(e->token.string), m->module_name);
 	return {};
 }
 
diff --git a/src/llvm_backend_opt.cpp b/src/llvm_backend_opt.cpp
index e6ccc9a57..7fe1359b4 100644
--- a/src/llvm_backend_opt.cpp
+++ b/src/llvm_backend_opt.cpp
@@ -396,7 +396,7 @@ gb_internal LLVMValueRef lb_run_instrumentation_pass_insert_call(lbProcedure *p,
 	lbValue cc = lb_find_procedure_value_from_entity(m, entity);
 
 	LLVMValueRef args[3] = {};
-	args[0] = p->value;
+	args[0] = LLVMConstPointerCast(p->value, lb_type(m, t_rawptr));
 
 	if (is_arch_wasm()) {
 		args[1] = LLVMConstPointerNull(lb_type(m, t_rawptr));
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 610c34de2..2f736ff6c 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -159,6 +159,11 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
 	case ProcInlining_no_inline:
 		lb_add_attribute_to_proc(m, p->value, "noinline");
 		break;
+	default:
+		if (build_context.internal_no_inline) {
+			lb_add_attribute_to_proc(m, p->value, "noinline");
+			break;
+		}
 	}
 
 	switch (entity->Procedure.optimization_mode) {
@@ -253,6 +258,11 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
 			if (e->flags&EntityFlag_NoAlias) {
 				lb_add_proc_attribute_at_index(p, offset+parameter_index, "noalias");
 			}
+			if (e->flags&EntityFlag_NoCapture) {
+				if (is_type_internally_pointer_like(e->type)) {
+					lb_add_proc_attribute_at_index(p, offset+parameter_index, "nocapture");
+				}
+			}
 			parameter_index += 1;
 		}
 	}
@@ -517,6 +527,7 @@ gb_internal void lb_begin_procedure_body(lbProcedure *p) {
 	lb_start_block(p, p->entry_block);
 
 	map_init(&p->direct_parameters);
+	p->variadic_reuses.allocator = heap_allocator();
 
 	GB_ASSERT(p->type != nullptr);
 
@@ -2274,6 +2285,39 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu
 			return res;
 		}
 
+	case BuiltinProc_add_sat:
+	case BuiltinProc_sub_sat:
+		{
+			Type *main_type = tv.type;
+			Type *type = main_type;
+
+			lbValue x = lb_build_expr(p, ce->args[0]);
+			lbValue y = lb_build_expr(p, ce->args[1]);
+			x = lb_emit_conv(p, x, type);
+			y = lb_emit_conv(p, y, type);
+
+			char const *name = nullptr;
+			if (is_type_unsigned(type)) {
+				switch (id) {
+				case BuiltinProc_add_sat: name = "llvm.uadd.sat"; break;
+				case BuiltinProc_sub_sat: name = "llvm.usub.sat"; break;
+				}
+			} else {
+				switch (id) {
+				case BuiltinProc_add_sat: name = "llvm.sadd.sat"; break;
+				case BuiltinProc_sub_sat: name = "llvm.ssub.sat"; break;
+				}
+			}
+			LLVMTypeRef types[1] = {lb_type(p->module, type)};
+
+			LLVMValueRef args[2] = { x.value, y.value };
+
+			lbValue res = {};
+			res.value = lb_call_intrinsic(p, name, args, gb_count_of(args), types, gb_count_of(types));
+			res.type = type;
+			return res;
+		}
+
 	case BuiltinProc_sqrt:
 		{
 			Type *type = tv.type;
@@ -3450,17 +3494,67 @@ gb_internal lbValue lb_build_call_expr_internal(lbProcedure *p, Ast *expr) {
 					}
 					isize slice_len = var_args.count;
 					if (slice_len > 0) {
-						lbAddr slice = lb_add_local_generated(p, slice_type, true);
-						lbAddr base_array = lb_add_local_generated(p, alloc_type_array(elem_type, slice_len), true);
+						lbAddr slice = {};
+
+						for (auto const &vr : p->variadic_reuses) {
+							if (are_types_identical(vr.slice_type, slice_type)) {
+								slice = vr.slice_addr;
+								break;
+							}
+						}
+
+						DeclInfo *d = decl_info_of_entity(p->entity);
+						if (d != nullptr && slice.addr.value == nullptr) {
+							for (auto const &vr : d->variadic_reuses) {
+								if (are_types_identical(vr.slice_type, slice_type)) {
+								#if LLVM_VERSION_MAJOR >= 13
+									// NOTE(bill): No point wasting even more memory, just reuse this stack variable too
+									if (p->variadic_reuses.count > 0) {
+										slice = p->variadic_reuses[0].slice_addr;
+									} else {
+										slice = lb_add_local_generated(p, slice_type, true);
+									}
+									// NOTE(bill): Change the underlying type to match the specific type
+									slice.addr.type = alloc_type_pointer(slice_type);
+								#else
+									slice = lb_add_local_generated(p, slice_type, true);
+								#endif
+									array_add(&p->variadic_reuses, lbVariadicReuseSlices{slice_type, slice});
+									break;
+								}
+							}
+						}
+
+						lbValue base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+						if (base_array_ptr.value == nullptr) {
+							if (d != nullptr) {
+								i64 max_bytes = d->variadic_reuse_max_bytes;
+								i64 max_align = gb_max(d->variadic_reuse_max_align, 16);
+								p->variadic_reuse_base_array_ptr = lb_add_local_generated(p, alloc_type_array(t_u8, max_bytes), true);
+								lb_try_update_alignment(p->variadic_reuse_base_array_ptr.addr, cast(unsigned)max_align);
+								base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+							} else {
+								base_array_ptr = lb_add_local_generated(p, alloc_type_array(elem_type, slice_len), true).addr;
+							}
+						}
+
+						if (slice.addr.value == nullptr) {
+							slice = lb_add_local_generated(p, slice_type, true);
+						}
+
+						GB_ASSERT(base_array_ptr.value != nullptr);
+						GB_ASSERT(slice.addr.value != nullptr);
+
+						base_array_ptr = lb_emit_conv(p, base_array_ptr, alloc_type_pointer(alloc_type_array(elem_type, slice_len)));
 
 						for (isize i = 0; i < var_args.count; i++) {
-							lbValue addr = lb_emit_array_epi(p, base_array.addr, cast(i32)i);
+							lbValue addr = lb_emit_array_epi(p, base_array_ptr, cast(i32)i);
 							lbValue var_arg = var_args[i];
 							var_arg = lb_emit_conv(p, var_arg, elem_type);
 							lb_emit_store(p, addr, var_arg);
 						}
 
-						lbValue base_elem = lb_emit_array_epi(p, base_array.addr, 0);
+						lbValue base_elem = lb_emit_array_epi(p, base_array_ptr, 0);
 						lbValue len = lb_const_int(p->module, t_int, slice_len);
 						lb_fill_slice(p, slice, base_elem, len);
 
diff --git a/src/llvm_backend_stmt.cpp b/src/llvm_backend_stmt.cpp
index 70b695627..e70cc503e 100644
--- a/src/llvm_backend_stmt.cpp
+++ b/src/llvm_backend_stmt.cpp
@@ -1736,10 +1736,17 @@ gb_internal void lb_build_type_switch_stmt(lbProcedure *p, AstTypeSwitchStmt *ss
 
 	for (Ast *clause : body->stmts) {
 		ast_node(cc, CaseClause, clause);
+
+		Entity *case_entity = implicit_entity_of_node(clause);
 		lb_open_scope(p, cc->scope);
+
 		if (cc->list.count == 0) {
 			lb_start_block(p, default_block);
-			lb_store_type_case_implicit(p, clause, parent_value, true);
+			if (case_entity->flags & EntityFlag_Value) {
+				lb_store_type_case_implicit(p, clause, parent_value, true);
+			} else {
+				lb_store_type_case_implicit(p, clause, parent_ptr, true);
+			}
 			lb_type_case_body(p, ss->label, clause, p->curr_block, done);
 			continue;
 		}
@@ -1769,7 +1776,6 @@ gb_internal void lb_build_type_switch_stmt(lbProcedure *p, AstTypeSwitchStmt *ss
 			LLVMAddCase(switch_instr, on_val.value, body->block);
 		}
 
-		Entity *case_entity = implicit_entity_of_node(clause);
 
 		lb_start_block(p, body);
 
@@ -1782,6 +1788,7 @@ gb_internal void lb_build_type_switch_stmt(lbProcedure *p, AstTypeSwitchStmt *ss
 			} else if (switch_kind == TypeSwitch_Any) {
 				data = lb_emit_load(p, lb_emit_struct_ep(p, parent_ptr, 0));
 			}
+			GB_ASSERT(is_type_pointer(data.type));
 
 			Type *ct = case_entity->type;
 			Type *ct_ptr = alloc_type_pointer(ct);
diff --git a/src/llvm_backend_type.cpp b/src/llvm_backend_type.cpp
index 2c4abbb4d..638170bfc 100644
--- a/src/llvm_backend_type.cpp
+++ b/src/llvm_backend_type.cpp
@@ -421,7 +421,7 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 			}
 			TokenPos pos = t->Named.type_name->token.pos;
 
-			lbValue loc = lb_const_source_code_location_const(m, proc_name, pos);
+			lbValue loc = lb_const_source_code_location_as_global_ptr(m, proc_name, pos);
 
 			LLVMValueRef vals[4] = {
 				lb_const_string(m, t->Named.type_name->token.string).value,
@@ -810,19 +810,18 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 		case Type_Struct: {
 			tag_type = t_type_info_struct;
 
-			LLVMValueRef vals[13] = {};
+			LLVMValueRef vals[11] = {};
 
 			{
-				lbValue is_packed       = lb_const_bool(m, t_bool, t->Struct.is_packed);
-				lbValue is_raw_union    = lb_const_bool(m, t_bool, t->Struct.is_raw_union);
-				lbValue is_no_copy      = lb_const_bool(m, t_bool, t->Struct.is_no_copy);
-				lbValue is_custom_align = lb_const_bool(m, t_bool, t->Struct.custom_align != 0);
-				vals[5] = is_packed.value;
-				vals[6] = is_raw_union.value;
-				vals[7] = is_no_copy.value;
-				vals[8] = is_custom_align.value;
+				u8 flags = 0;
+				if (t->Struct.is_packed)    flags |= 1<<0;
+				if (t->Struct.is_raw_union) flags |= 1<<1;
+				if (t->Struct.is_no_copy)   flags |= 1<<2;
+				if (t->Struct.custom_align) flags |= 1<<3;
+
+				vals[6] = lb_const_int(m, t_u8, flags).value;
 				if (is_type_comparable(t) && !is_type_simple_compare(t)) {
-					vals[9] = lb_equal_proc_for_type(m, t).value;
+					vals[10] = lb_equal_proc_for_type(m, t).value;
 				}
 
 
@@ -831,11 +830,11 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 
 					lbValue soa_kind = lb_const_value(m, kind_type, exact_value_i64(t->Struct.soa_kind));
 					LLVMValueRef soa_type = get_type_info_ptr(m, t->Struct.soa_elem);
-					lbValue soa_len = lb_const_int(m, t_int, t->Struct.soa_count);
+					lbValue soa_len = lb_const_int(m, t_i32, t->Struct.soa_count);
 
-					vals[10] = soa_kind.value;
-					vals[11] = soa_type;
-					vals[12] = soa_len.value;
+					vals[7] = soa_kind.value;
+					vals[8] = soa_len.value;
+					vals[9] = soa_type;
 				}
 			}
 
@@ -882,12 +881,13 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 
 				}
 
-				lbValue cv = lb_const_int(m, t_int, count);
-				vals[0] = llvm_const_slice(m, memory_types,   cv);
-				vals[1] = llvm_const_slice(m, memory_names,   cv);
-				vals[2] = llvm_const_slice(m, memory_offsets, cv);
-				vals[3] = llvm_const_slice(m, memory_usings,  cv);
-				vals[4] = llvm_const_slice(m, memory_tags,    cv);
+				lbValue cv = lb_const_int(m, t_i32, count);
+				vals[0] = memory_types.value;
+				vals[1] = memory_names.value;
+				vals[2] = memory_offsets.value;
+				vals[3] = memory_usings.value;
+				vals[4] = memory_tags.value;
+				vals[5] = cv.value;
 			}
 			for (isize i = 0; i < gb_count_of(vals); i++) {
 				if (vals[i] == nullptr) {
@@ -994,7 +994,7 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 			{
 				tag_type = t_type_info_bit_field;
 
-				LLVMValueRef vals[6] = {};
+				LLVMValueRef vals[7] = {};
 				vals[0] = get_type_info_ptr(m, t->BitField.backing_type);
 				isize count = t->BitField.fields.count;
 				if (count > 0) {
@@ -1035,11 +1035,12 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 					}
 
 					lbValue cv = lb_const_int(m, t_int, count);
-					vals[1] = llvm_const_slice(m, memory_names,       cv);
-					vals[2] = llvm_const_slice(m, memory_types,       cv);
-					vals[3] = llvm_const_slice(m, memory_bit_sizes,   cv);
-					vals[4] = llvm_const_slice(m, memory_bit_offsets, cv);
-					vals[5] = llvm_const_slice(m, memory_tags,        cv);
+					vals[1] =  memory_names.value;
+					vals[2] =  memory_types.value;
+					vals[3] =  memory_bit_sizes.value;
+					vals[4] =  memory_bit_offsets.value;
+					vals[5] =  memory_tags.value;
+					vals[6] =  cv.value;
 				}
 
 
diff --git a/src/main.cpp b/src/main.cpp
index e6a0aecf0..41a95338b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -394,6 +394,7 @@ enum BuildFlagKind {
 	BuildFlag_InternalIgnorePanic,
 	BuildFlag_InternalModulePerFile,
 	BuildFlag_InternalCached,
+	BuildFlag_InternalNoInline,
 
 	BuildFlag_Tilde,
 
@@ -598,6 +599,7 @@ gb_internal bool parse_build_flags(Array<String> args) {
 	add_flag(&build_flags, BuildFlag_InternalIgnorePanic,     str_lit("internal-ignore-panic"),     BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalModulePerFile,   str_lit("internal-module-per-file"),  BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalCached,          str_lit("internal-cached"),           BuildFlagParam_None,    Command_all);
+	add_flag(&build_flags, BuildFlag_InternalNoInline,        str_lit("internal-no-inline"),        BuildFlagParam_None,    Command_all);
 
 #if ALLOW_TILDE
 	add_flag(&build_flags, BuildFlag_Tilde,                   str_lit("tilde"),                     BuildFlagParam_None,    Command__does_build);
@@ -605,6 +607,7 @@ gb_internal bool parse_build_flags(Array<String> args) {
 
 	add_flag(&build_flags, BuildFlag_Sanitize,                str_lit("sanitize"),                  BuildFlagParam_String,  Command__does_build, true);
 
+
 #if defined(GB_SYSTEM_WINDOWS)
 	add_flag(&build_flags, BuildFlag_IgnoreVsSearch,          str_lit("ignore-vs-search"),          BuildFlagParam_None,    Command__does_build);
 	add_flag(&build_flags, BuildFlag_ResourceFile,            str_lit("resource"),                  BuildFlagParam_String,  Command__does_build);
@@ -1416,11 +1419,15 @@ gb_internal bool parse_build_flags(Array<String> args) {
 							break;
 						case BuildFlag_InternalModulePerFile:
 							build_context.module_per_file = true;
+							build_context.use_separate_modules = true;
 							break;
 						case BuildFlag_InternalCached:
 							build_context.cached = true;
 							build_context.use_separate_modules = true;
 							break;
+						case BuildFlag_InternalNoInline:
+							build_context.internal_no_inline = true;
+							break;
 
 						case BuildFlag_Tilde:
 							build_context.tilde_backend = true;
@@ -1441,6 +1448,7 @@ gb_internal bool parse_build_flags(Array<String> args) {
 							}
 							break;
 
+
 					#if defined(GB_SYSTEM_WINDOWS)
 						case BuildFlag_IgnoreVsSearch: {
 							GB_ASSERT(value.kind == ExactValue_Invalid);
@@ -2164,7 +2172,7 @@ gb_internal void print_show_help(String const arg0, String const &command) {
 		if (LB_USE_NEW_PASS_SYSTEM) {
 			print_usage_line(3, "-o:aggressive");
 		}
-		print_usage_line(2, "The default is -o:none.");
+		print_usage_line(2, "The default is -o:minimal.");
 		print_usage_line(0, "");
 	}
 
@@ -2310,9 +2318,9 @@ gb_internal void print_show_help(String const arg0, String const &command) {
 		print_usage_line(0, "");
 
 		print_usage_line(1, "-use-separate-modules");
-		print_usage_line(1, "[EXPERIMENTAL]");
 		print_usage_line(2, "The backend generates multiple build units which are then linked together.");
 		print_usage_line(2, "Normally, a single build unit is generated for a standard project.");
+		print_usage_line(2, "This is the default behaviour on Windows for '-o:none' and '-o:minimal' builds.");
 		print_usage_line(0, "");
 
 	}
diff --git a/src/parser.cpp b/src/parser.cpp
index 9ce3d563d..aba2b8276 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -112,17 +112,17 @@ gb_internal isize ast_node_size(AstKind kind) {
 
 }
 
-gb_global std::atomic<isize> global_total_node_memory_allocated;
+// gb_global std::atomic<isize> global_total_node_memory_allocated;
 
 // NOTE(bill): And this below is why is I/we need a new language! Discriminated unions are a pain in C/C++
 gb_internal Ast *alloc_ast_node(AstFile *f, AstKind kind) {
 	isize size = ast_node_size(kind);
 
-	Ast *node = cast(Ast *)arena_alloc(&global_thread_local_ast_arena, size, 16);
+	Ast *node = cast(Ast *)arena_alloc(get_arena(ThreadArena_Permanent), size, 16);
 	node->kind = kind;
 	node->file_id = f ? f->id : 0;
 
-	global_total_node_memory_allocated.fetch_add(size);
+	// global_total_node_memory_allocated.fetch_add(size);
 
 	return node;
 }
@@ -4014,6 +4014,7 @@ struct ParseFieldPrefixMapping {
 gb_global ParseFieldPrefixMapping const parse_field_prefix_mappings[] = {
 	{str_lit("using"),        Token_using,     FieldFlag_using},
 	{str_lit("no_alias"),     Token_Hash,      FieldFlag_no_alias},
+	{str_lit("no_capture"),   Token_Hash,      FieldFlag_no_capture},
 	{str_lit("c_vararg"),     Token_Hash,      FieldFlag_c_vararg},
 	{str_lit("const"),        Token_Hash,      FieldFlag_const},
 	{str_lit("any_int"),      Token_Hash,      FieldFlag_any_int},
@@ -5412,7 +5413,7 @@ gb_internal ParseFileError init_ast_file(AstFile *f, String const &fullpath, Tok
 	if (!string_ends_with(f->fullpath, str_lit(".odin"))) {
 		return ParseFile_WrongExtension;
 	}
-	zero_item(&f->tokenizer);
+	gb_zero_item(&f->tokenizer);
 	f->tokenizer.curr_file_id = f->id;
 
 	TokenizerInitError err = init_tokenizer_from_fullpath(&f->tokenizer, f->fullpath, build_context.copy_file_contents);
@@ -5608,7 +5609,7 @@ gb_internal AstPackage *try_add_import_path(Parser *p, String path, String const
 	pkg->foreign_files.allocator = permanent_allocator();
 
 	// NOTE(bill): Single file initial package
-	if (kind == Package_Init && string_ends_with(path, FILE_EXT)) {
+	if (kind == Package_Init && !path_is_directory(path) && string_ends_with(path, FILE_EXT)) {
 		FileInfo fi = {};
 		fi.name = filename_from_path(path);
 		fi.fullpath = path;
@@ -6528,6 +6529,7 @@ gb_internal ParseFileError parse_packages(Parser *p, String init_filename) {
 	GB_ASSERT(init_filename.text[init_filename.len] == 0);
 
 	String init_fullpath = path_to_full_path(permanent_allocator(), init_filename);
+
 	if (!path_is_directory(init_fullpath)) {
 		String const ext = str_lit(".odin");
 		if (!string_ends_with(init_fullpath, ext)) {
@@ -6541,9 +6543,8 @@ gb_internal ParseFileError parse_packages(Parser *p, String init_filename) {
 		}
 		if ((build_context.command_kind & Command__does_build) &&
 		    build_context.build_mode == BuildMode_Executable) {
-			String short_path = filename_from_path(path);
-			char *cpath = alloc_cstring(temporary_allocator(), short_path);
-			if (gb_file_exists(cpath)) {
+			String output_path = path_to_string(temporary_allocator(), build_context.build_paths[8]);
+			if (path_is_directory(output_path)) {
 			    	error({}, "Please specify the executable name with -out:<string> as a directory exists with the same name in the current working directory");
 			    	return ParseFile_DirectoryAlreadyExists;
 			}
diff --git a/src/parser.hpp b/src/parser.hpp
index 86b3393af..565a8e621 100644
--- a/src/parser.hpp
+++ b/src/parser.hpp
@@ -331,8 +331,10 @@ enum FieldFlag : u32 {
 	FieldFlag_by_ptr    = 1<<8,
 	FieldFlag_no_broadcast = 1<<9, // disallow array programming
 
+	FieldFlag_no_capture  = 1<<11,
+
 	// Internal use by the parser only
-	FieldFlag_Tags      = 1<<10,
+	FieldFlag_Tags      = 1<<15,
 	FieldFlag_Results   = 1<<16,
 
 
@@ -340,7 +342,10 @@ enum FieldFlag : u32 {
 	FieldFlag_Invalid   = 1u<<31,
 
 	// Parameter List Restrictions
-	FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast,
+	FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|
+	                      FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast|
+	                      FieldFlag_no_capture,
+
 	FieldFlag_Struct    = FieldFlag_using|FieldFlag_subtype|FieldFlag_Tags,
 };
 
@@ -873,10 +878,8 @@ gb_internal gb_inline bool is_ast_when_stmt(Ast *node) {
 	return node->kind == Ast_WhenStmt;
 }
 
-gb_global gb_thread_local Arena global_thread_local_ast_arena = {};
-
 gb_internal gb_inline gbAllocator ast_allocator(AstFile *f) {
-	return arena_allocator(&global_thread_local_ast_arena);
+	return permanent_allocator();
 }
 
 gb_internal Ast *alloc_ast_node(AstFile *f, AstKind kind);
diff --git a/src/queue.cpp b/src/queue.cpp
index 2ad9cb29f..dee9ad1f8 100644
--- a/src/queue.cpp
+++ b/src/queue.cpp
@@ -16,7 +16,7 @@ struct MPSCQueue {
 	std::atomic<isize> count;
 };
 
-template <typename T> gb_internal void  mpsc_init   (MPSCQueue<T> *q);
+template <typename T> gb_internal void  mpsc_init   (MPSCQueue<T> *q, gbAllocator const &allocator);
 template <typename T> gb_internal void  mpsc_destroy(MPSCQueue<T> *q);
 template <typename T> gb_internal isize mpsc_enqueue(MPSCQueue<T> *q, T const &value);
 template <typename T> gb_internal bool  mpsc_dequeue(MPSCQueue<T> *q, T *value_);
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index 5dbbe37c4..8363a4553 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -3,20 +3,28 @@
 struct WorkerTask;
 struct ThreadPool;
 
-gb_thread_local Thread *current_thread;
+gb_global gb_thread_local Thread *current_thread;
+gb_internal Thread *get_current_thread(void) {
+	return current_thread;
+}
 
 gb_internal void thread_pool_init(ThreadPool *pool, isize worker_count, char const *worker_name);
 gb_internal void thread_pool_destroy(ThreadPool *pool);
 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data);
 gb_internal void thread_pool_wait(ThreadPool *pool);
 
+enum GrabState {
+	Grab_Success = 0,
+	Grab_Empty   = 1,
+	Grab_Failed  = 2,
+};
+
 struct ThreadPool {
-	gbAllocator threads_allocator;
-	Slice<Thread> threads;
+	gbAllocator       threads_allocator;
+	Slice<Thread>     threads;
 	std::atomic<bool> running;
 
 	Futex tasks_available;
-
 	Futex tasks_left;
 };
 
@@ -46,7 +54,7 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 
 	for_array_off(i, 1, pool->threads) {
 		Thread *t = &pool->threads[i];
-		pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
+		pool->tasks_available.fetch_add(1, std::memory_order_acquire);
 		futex_broadcast(&pool->tasks_available);
 		thread_join_and_destroy(t);
 	}
@@ -54,51 +62,86 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 	gb_free(pool->threads_allocator, pool->threads.data);
 }
 
+TaskRingBuffer *task_ring_grow(TaskRingBuffer *ring, isize bottom, isize top) {
+	TaskRingBuffer *new_ring = task_ring_init(ring->size * 2);
+	for (isize i = top; i < bottom; i++) {
+		new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size];
+	}
+	return new_ring;
+}
+
 void thread_pool_queue_push(Thread *thread, WorkerTask task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load();
+	isize bot                = thread->queue.bottom.load(std::memory_order_relaxed);
+	isize top                = thread->queue.top.load(std::memory_order_acquire);
+	TaskRingBuffer *cur_ring   = thread->queue.ring.load(std::memory_order_relaxed);
 
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+	isize size = bot - top;
+	if (size > (cur_ring->size - 1)) {
+		// Queue is full
+		thread->queue.ring = task_ring_grow(thread->queue.ring, bot, top);
+		cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	}
 
-		u64 new_head = (head + 1) & mask;
-		GB_ASSERT_MSG(new_head != tail, "Thread Queue Full!");
-
-		// This *must* be done in here, to avoid a potential race condition where we no longer own the slot by the time we're assigning
-		thread->queue[head] = task;
-		new_capture = (new_head << 32) | tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture));
+	cur_ring->buffer[bot % cur_ring->size] = task;
+	std::atomic_thread_fence(std::memory_order_release);
+	thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
 
 	thread->pool->tasks_left.fetch_add(1, std::memory_order_release);
 	thread->pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
 	futex_broadcast(&thread->pool->tasks_available);
 }
 
-bool thread_pool_queue_pop(Thread *thread, WorkerTask *task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load(std::memory_order_acquire);
+GrabState thread_pool_queue_take(Thread *thread, WorkerTask *task) {
+	isize bot = thread->queue.bottom.load(std::memory_order_relaxed) - 1;
+	TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	thread->queue.bottom.store(bot, std::memory_order_relaxed);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
 
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+	isize top = thread->queue.top.load(std::memory_order_relaxed);
+	if (top <= bot) {
 
-		u64 new_tail = (tail + 1) & mask;
-		if (tail == head) {
-			return false;
+		// Queue is not empty
+		*task = cur_ring->buffer[bot % cur_ring->size];
+		if (top == bot) {
+			// Only one entry left in queue
+			if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+				// Race failed
+				thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+				return Grab_Empty;
+			}
+
+			thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+			return Grab_Success;
 		}
 
-		// Making a copy of the task before we increment the tail, avoiding the same potential race condition as above
-		*task = thread->queue[tail];
+		// We got a task without hitting a race
+		return Grab_Success;
+	} else {
+		// Queue is empty
+		thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+		return Grab_Empty;
+	}
+}
 
-		new_capture = (head << 32) | new_tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture, std::memory_order_release));
+GrabState thread_pool_queue_steal(Thread *thread, WorkerTask *task) {
+	isize top = thread->queue.top.load(std::memory_order_acquire);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
+	isize bot = thread->queue.bottom.load(std::memory_order_acquire);
 
-	return true;
+	GrabState ret = Grab_Empty;
+	if (top < bot) {
+		// Queue is not empty
+		TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_consume);
+		*task = cur_ring->buffer[top % cur_ring->size];
+
+		if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+			// Race failed
+			ret = Grab_Failed;
+		} else {
+			ret = Grab_Success;
+		}
+	}
+	return ret;
 }
 
 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data) {
@@ -115,12 +158,11 @@ gb_internal void thread_pool_wait(ThreadPool *pool) {
 
 	while (pool->tasks_left.load(std::memory_order_acquire)) {
 		// if we've got tasks on our queue, run them
-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);
 		}
 
-
 		// is this mem-barriered enough?
 		// This *must* be executed in this order, so the futex wakes immediately
 		// if rem_tasks has changed since we checked last, otherwise the program
@@ -145,7 +187,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 		usize finished_tasks = 0;
 		i32 state;
 
-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);
 
@@ -167,7 +209,12 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 
 				Thread *thread = &pool->threads.data[idx];
 				WorkerTask task;
-				if (thread_pool_queue_pop(thread, &task)) {
+
+				GrabState ret = thread_pool_queue_steal(thread, &task);
+				switch (ret) {
+				case Grab_Empty:
+					continue;
+				case Grab_Success:
 					task.do_work(task.data);
 					pool->tasks_left.fetch_sub(1, std::memory_order_release);
 
@@ -175,6 +222,8 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 						futex_signal(&pool->tasks_left);
 					}
 
+					/*fallthrough*/
+				case Grab_Failed:
 					goto main_loop_continue;
 				}
 			}
@@ -182,6 +231,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 
 		// if we've done all our work, and there's nothing to steal, go to sleep
 		state = pool->tasks_available.load(std::memory_order_acquire);
+		if (!pool->running) { break; }
 		futex_wait(&pool->tasks_available, state);
 
 		main_loop_continue:;
diff --git a/src/threading.cpp b/src/threading.cpp
index 717dcb874..011b66028 100644
--- a/src/threading.cpp
+++ b/src/threading.cpp
@@ -46,6 +46,18 @@ typedef struct WorkerTask {
 	void           *data;
 } WorkerTask;
 
+typedef struct TaskRingBuffer {
+	std::atomic<isize> size;
+	std::atomic<WorkerTask *> buffer;
+} TaskRingBuffer;
+
+typedef struct TaskQueue {
+	std::atomic<isize> top;
+	std::atomic<isize> bottom;
+
+	std::atomic<TaskRingBuffer *> ring;
+} TaskQueue;
+
 struct Thread {
 #if defined(GB_SYSTEM_WINDOWS)
 	void *win32_handle;
@@ -54,13 +66,13 @@ struct Thread {
 #endif
 
 	isize idx;
+	isize stack_size;
 
-	WorkerTask *queue;
-	size_t capacity;
-	std::atomic<uint64_t> head_and_tail;
-
-	isize  stack_size;
+	struct TaskQueue   queue;
 	struct ThreadPool *pool;
+
+	struct Arena *permanent_arena;
+	struct Arena *temporary_arena;
 };
 
 typedef std::atomic<i32> Futex;
@@ -551,6 +563,20 @@ gb_internal void *internal_thread_proc(void *arg) {
 }
 #endif
 
+gb_internal TaskRingBuffer *task_ring_init(isize size) {
+	TaskRingBuffer *ring = gb_alloc_item(heap_allocator(), TaskRingBuffer);
+	ring->size = size;
+	ring->buffer = gb_alloc_array(heap_allocator(), WorkerTask, ring->size);
+	return ring;
+}
+
+gb_internal void thread_queue_destroy(TaskQueue *q) {
+	gb_free(heap_allocator(), (*q->ring).buffer);
+	gb_free(heap_allocator(), q->ring);
+}
+
+gb_internal void thread_init_arenas(Thread *t);
+
 gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	gb_zero_item(t);
 #if defined(GB_SYSTEM_WINDOWS)
@@ -559,13 +585,13 @@ gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	t->posix_handle = 0;
 #endif
 
-	t->capacity = 1 << 14; // must be a power of 2
-	t->queue = gb_alloc_array(heap_allocator(), WorkerTask, t->capacity);
-	t->head_and_tail = 0;
+	// Size must be a power of 2
+	t->queue.ring = task_ring_init(1 << 14);
 	t->pool = pool;
 	t->idx = idx;
-}
 
+	thread_init_arenas(t);
+}
 
 gb_internal void thread_init_and_start(ThreadPool *pool, Thread *t, isize idx) {
 	thread_init(pool, t, idx);
@@ -598,7 +624,7 @@ gb_internal void thread_join_and_destroy(Thread *t) {
 	t->posix_handle = 0;
 #endif
 
-	gb_free(heap_allocator(), t->queue);
+	thread_queue_destroy(&t->queue);
 }
 
 gb_internal void thread_set_name(Thread *t, char const *name) {
@@ -770,13 +796,27 @@ gb_internal void futex_wait(Futex *f, Footex val) {
 
 #elif defined(GB_SYSTEM_OSX)
 
+// IMPORTANT NOTE(laytan): We use `OS_SYNC_*_SHARED` and `UL_COMPARE_AND_WAIT_SHARED` flags here.
+// these flags tell the kernel that we are using these futexes across different processes which
+// causes it to opt-out of some optimisations.
+//
+// BUT this is not actually the case! We should be using the normal non-shared version and letting
+// the kernel optimize (I've measured it to be about 10% faster at the parsing/type checking stages).
+//
+// However we have reports of people on MacOS running into kernel panics, and this seems to fix it for them.
+// Which means there is probably a bug in the kernel in one of these non-shared optimisations causing the panic.
+//
+// The panic also doesn't seem to happen on normal M1 CPUs, and happen more on later CPUs or pro/max series.
+// Probably because they have more going on in terms of threads etc.
+
 #if __has_include(<os/os_sync_wait_on_address.h>)
 	#define DARWIN_WAIT_ON_ADDRESS_AVAILABLE
 	#include <os/os_sync_wait_on_address.h>
 #endif
 
-#define UL_COMPARE_AND_WAIT	0x00000001
-#define ULF_NO_ERRNO        0x01000000
+#define UL_COMPARE_AND_WAIT        0x00000001
+#define UL_COMPARE_AND_WAIT_SHARED 0x00000003
+#define ULF_NO_ERRNO               0x01000000
 
 extern "C" int __ulock_wait(uint32_t operation, void *addr, uint64_t value, uint32_t timeout); /* timeout is specified in microseconds */
 extern "C" int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
@@ -785,7 +825,7 @@ gb_internal void futex_signal(Futex *f) {
 	#ifdef DARWIN_WAIT_ON_ADDRESS_AVAILABLE
 	if (__builtin_available(macOS 14.4, *)) {
 		for (;;) {
-			int ret = os_sync_wake_by_address_any(f, sizeof(Futex), OS_SYNC_WAKE_BY_ADDRESS_NONE);
+			int ret = os_sync_wake_by_address_any(f, sizeof(Futex), OS_SYNC_WAKE_BY_ADDRESS_SHARED);
 			if (ret >= 0) {
 				return;
 			}
@@ -800,7 +840,7 @@ gb_internal void futex_signal(Futex *f) {
 	} else {
 	#endif
 	for (;;) {
-		int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, f, 0);
+		int ret = __ulock_wake(UL_COMPARE_AND_WAIT_SHARED | ULF_NO_ERRNO, f, 0);
 		if (ret >= 0) {
 			return;
 		}
@@ -821,7 +861,7 @@ gb_internal void futex_broadcast(Futex *f) {
 	#ifdef DARWIN_WAIT_ON_ADDRESS_AVAILABLE
 	if (__builtin_available(macOS 14.4, *)) {
 		for (;;) {
-			int ret = os_sync_wake_by_address_all(f, sizeof(Footex), OS_SYNC_WAKE_BY_ADDRESS_NONE);
+			int ret = os_sync_wake_by_address_all(f, sizeof(Footex), OS_SYNC_WAKE_BY_ADDRESS_SHARED);
 			if (ret >= 0) {
 				return;
 			}
@@ -837,7 +877,7 @@ gb_internal void futex_broadcast(Futex *f) {
 	#endif
 	for (;;) {
 		enum { ULF_WAKE_ALL = 0x00000100 };
-		int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, f, 0);
+		int ret = __ulock_wake(UL_COMPARE_AND_WAIT_SHARED | ULF_NO_ERRNO | ULF_WAKE_ALL, f, 0);
 		if (ret == 0) {
 			return;
 		}
@@ -858,7 +898,7 @@ gb_internal void futex_wait(Futex *f, Footex val) {
 	#ifdef DARWIN_WAIT_ON_ADDRESS_AVAILABLE
 	if (__builtin_available(macOS 14.4, *)) {
 		for (;;) {
-			int ret = os_sync_wait_on_address(f, cast(uint64_t)(val), sizeof(Footex), OS_SYNC_WAIT_ON_ADDRESS_NONE);
+			int ret = os_sync_wait_on_address(f, cast(uint64_t)(val), sizeof(Footex), OS_SYNC_WAIT_ON_ADDRESS_SHARED);
 			if (ret >= 0) {
 				if (*f != val) {
 					return;
@@ -876,7 +916,7 @@ gb_internal void futex_wait(Futex *f, Footex val) {
 	} else {
 	#endif
 	for (;;) {
-		int ret = __ulock_wait(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, f, val, 0);
+		int ret = __ulock_wait(UL_COMPARE_AND_WAIT_SHARED | ULF_NO_ERRNO, f, val, 0);
 		if (ret >= 0) {
 			if (*f != val) {
 				return;
diff --git a/src/types.cpp b/src/types.cpp
index c3a5fb539..944760142 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -964,7 +964,7 @@ gb_internal Type *alloc_type(TypeKind kind) {
 	// gbAllocator a = heap_allocator();
 	gbAllocator a = permanent_allocator();
 	Type *t = gb_alloc_item(a, Type);
-	zero_item(t);
+	gb_zero_item(t);
 	t->kind = kind;
 	t->cached_size  = -1;
 	t->cached_align = -1;
@@ -1637,6 +1637,26 @@ gb_internal Type *base_array_type(Type *t) {
 	return t;
 }
 
+
+gb_internal Type *base_any_array_type(Type *t) {
+	Type *bt = base_type(t);
+	if (is_type_array(bt)) {
+		return bt->Array.elem;
+	} else if (is_type_slice(bt)) {
+		return bt->Slice.elem;
+	} else if (is_type_dynamic_array(bt)) {
+		return bt->DynamicArray.elem;
+	} else if (is_type_enumerated_array(bt)) {
+		return bt->EnumeratedArray.elem;
+	} else if (is_type_simd_vector(bt)) {
+		return bt->SimdVector.elem;
+	} else if (is_type_matrix(bt)) {
+		return bt->Matrix.elem;
+	}
+	return t;
+}
+
+
 gb_internal bool is_type_generic(Type *t) {
 	t = base_type(t);
 	return t->kind == Type_Generic;
@@ -2011,6 +2031,24 @@ gb_internal bool is_type_valid_bit_set_elem(Type *t) {
 	return false;
 }
 
+
+gb_internal bool is_valid_bit_field_backing_type(Type *type) {
+	if (type == nullptr) {
+		return false;
+	}
+	type = base_type(type);
+	if (is_type_untyped(type)) {
+		return false;
+	}
+	if (is_type_integer(type)) {
+		return true;
+	}
+	if (type->kind == Type_Array) {
+		return is_type_integer(type->Array.elem);
+	}
+	return false;
+}
+
 gb_internal Type *bit_set_to_int(Type *t) {
 	GB_ASSERT(is_type_bit_set(t));
 	Type *bt = base_type(t);
@@ -2018,6 +2056,9 @@ gb_internal Type *bit_set_to_int(Type *t) {
 	if (underlying != nullptr && is_type_integer(underlying)) {
 		return underlying;
 	}
+	if (underlying != nullptr && is_valid_bit_field_backing_type(underlying)) {
+		return underlying;
+	}
 
 	i64 sz = type_size_of(t);
 	switch (sz) {
@@ -2923,11 +2964,14 @@ gb_internal Type *c_vararg_promote_type(Type *type) {
 
 	if (core->kind == Type_Basic) {
 		switch (core->Basic.kind) {
+		case Basic_f16:
 		case Basic_f32:
 		case Basic_UntypedFloat:
 			return t_f64;
+		case Basic_f16le:
 		case Basic_f32le:
 			return t_f64le;
+		case Basic_f16be:
 		case Basic_f32be:
 			return t_f64be;
 
diff --git a/tests/benchmark/crypto/benchmark_crypto.odin b/tests/benchmark/crypto/benchmark_crypto.odin
index e90216ad6..b2ac4bca3 100644
--- a/tests/benchmark/crypto/benchmark_crypto.odin
+++ b/tests/benchmark/crypto/benchmark_crypto.odin
@@ -28,6 +28,32 @@ benchmark_crypto :: proc(t: ^testing.T) {
 		strings.builder_destroy(&str)
 	}
 
+	{
+		name := "AES256-CTR 64 bytes"
+		options := &time.Benchmark_Options {
+			rounds = 1_000,
+			bytes = 64,
+			setup = _setup_sized_buf,
+			bench = _benchmark_aes256_ctr,
+			teardown = _teardown_sized_buf,
+		}
+
+		err := time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+
+		name = "AES256-CTR 1024 bytes"
+		options.bytes = 1024
+		err = time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+
+		name = "AES256-CTR 65536 bytes"
+		options.bytes = 65536
+		err = time.benchmark(options, context.allocator)
+		testing.expect(t, err == nil, name)
+		benchmark_print(&str, name, options)
+	}
 	{
 		name := "ChaCha20 64 bytes"
 		options := &time.Benchmark_Options {
@@ -323,6 +349,36 @@ _benchmark_chacha20poly1305 :: proc(
 	return nil
 }
 
+@(private)
+_benchmark_aes256_ctr :: proc(
+	options: ^time.Benchmark_Options,
+	allocator := context.allocator,
+) -> (
+	err: time.Benchmark_Error,
+) {
+	buf := options.input
+	key := [aes.KEY_SIZE_256]byte {
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+		0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef,
+	}
+	nonce := [aes.CTR_IV_SIZE]byte {
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	}
+
+	ctx: aes.Context_CTR = ---
+	aes.init_ctr(&ctx, key[:], nonce[:])
+
+	for _ in 0 ..= options.rounds {
+		aes.xor_bytes_ctr(&ctx, buf, buf)
+	}
+	options.count = options.rounds
+	options.processed = options.rounds * options.bytes
+	return nil
+}
+
 _benchmark_aes256_gcm :: proc(
 	options: ^time.Benchmark_Options,
 	allocator := context.allocator,
diff --git a/tests/core/crypto/test_core_crypto_aes.odin b/tests/core/crypto/test_core_crypto_aes.odin
index 4d4c06bdc..c2fa2835c 100644
--- a/tests/core/crypto/test_core_crypto_aes.odin
+++ b/tests/core/crypto/test_core_crypto_aes.odin
@@ -12,8 +12,6 @@ import "core:crypto/sha2"
 test_aes :: proc(t: ^testing.T) {
 	runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD()
 
-	log.info("Testing AES")
-
 	impls := make([dynamic]aes.Implementation, 0, 2)
 	defer delete(impls)
 	append(&impls, aes.Implementation.Portable)
@@ -29,7 +27,7 @@ test_aes :: proc(t: ^testing.T) {
 }
 
 test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-ECB/%v", impl)
+	log.debugf("Testing AES-ECB/%v", impl)
 
 	test_vectors := []struct {
 		key: string,
@@ -136,7 +134,7 @@ test_aes_ecb :: proc(t: ^testing.T, impl: aes.Implementation) {
 }
 
 test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-CTR/%v", impl)
+	log.debugf("Testing AES-CTR/%v", impl)
 
 	test_vectors := []struct {
 		key: string,
@@ -200,7 +198,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
 	ctx: aes.Context_CTR
 	key: [aes.KEY_SIZE_256]byte
 	nonce: [aes.CTR_IV_SIZE]byte
-	aes.init_ctr(&ctx, key[:], nonce[:])
+	aes.init_ctr(&ctx, key[:], nonce[:], impl)
 
 	h_ctx: sha2.Context_512
 	sha2.init_512_256(&h_ctx)
@@ -226,7 +224,7 @@ test_aes_ctr :: proc(t: ^testing.T, impl: aes.Implementation) {
 }
 
 test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
-	log.infof("Testing AES-GCM/%v", impl)
+	log.debugf("Testing AES-GCM/%v", impl)
 
 	// NIST did a reorg of their site, so the source of the test vectors
 	// is only available from an archive.  The commented out tests are
@@ -431,7 +429,7 @@ test_aes_gcm :: proc(t: ^testing.T, impl: aes.Implementation) {
 		testing.expectf(
 			t,
 			ok && dst_str == v.plaintext,
-			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %s) instead",
+			"AES-GCM/%v: Expected: (%s, true) for open(%s, %s, %s, %s, %s), but got (%s, %v) instead",
 			impl,
 			v.plaintext,
 			v.key,
diff --git a/tests/core/crypto/test_core_crypto_ecc25519.odin b/tests/core/crypto/test_core_crypto_ecc25519.odin
index baf4a1a38..fec4fa38e 100644
--- a/tests/core/crypto/test_core_crypto_ecc25519.odin
+++ b/tests/core/crypto/test_core_crypto_ecc25519.odin
@@ -58,9 +58,9 @@ test_sqrt_ratio_m1 :: proc(t: ^testing.T) {
 		v_bytes, _ := hex.decode(transmute([]byte)(v.v), context.temp_allocator)
 		r_bytes, _ := hex.decode(transmute([]byte)(v.r), context.temp_allocator)
 
-		u_ := transmute(^[32]byte)(raw_data(u_bytes))
-		v_ := transmute(^[32]byte)(raw_data(v_bytes))
-		r_ := transmute(^[32]byte)(raw_data(r_bytes))
+		u_ := (^[32]byte)(raw_data(u_bytes))
+		v_ := (^[32]byte)(raw_data(v_bytes))
+		r_ := (^[32]byte)(raw_data(r_bytes))
 
 		u, vee, r: field.Tight_Field_Element
 		field.fe_from_bytes(&u, u_)
diff --git a/tests/core/crypto/test_core_crypto_kdf.odin b/tests/core/crypto/test_core_crypto_kdf.odin
index 247529e65..c15dc2206 100644
--- a/tests/core/crypto/test_core_crypto_kdf.odin
+++ b/tests/core/crypto/test_core_crypto_kdf.odin
@@ -161,7 +161,7 @@ test_pbkdf2 :: proc(t: ^testing.T) {
 		testing.expectf(
 			t,
 			dst_str == v.dk,
-			"HMAC-%s: Expected: %s for input of (%s, %s, %d), but got %s instead",
+			"PBKDF2-%s: Expected: %s for input of (%s, %s, %d), but got %s instead",
 			algo_name,
 			v.dk,
 			v.password,
diff --git a/tests/core/encoding/ini/test_core_ini.odin b/tests/core/encoding/ini/test_core_ini.odin
new file mode 100644
index 000000000..6e6c8152e
--- /dev/null
+++ b/tests/core/encoding/ini/test_core_ini.odin
@@ -0,0 +1,120 @@
+package test_core_ini
+
+import "base:runtime"
+import "core:encoding/ini"
+import "core:mem/virtual"
+import "core:strings"
+import "core:testing"
+
+@test
+parse_ini :: proc(t: ^testing.T) {
+	ini_data := `
+		[LOG]
+		level = "devel"
+		file = "/var/log/testing.log"
+
+		[USER]
+		first_name = "John"
+		surname = "Smith"
+	`
+
+	m, err := ini.load_map_from_string(ini_data, context.allocator)
+	defer ini.delete_map(m)
+
+	testing.expectf(
+		t,
+		strings.contains(m["LOG"]["level"], "devel"),
+		"Expected m[\"LOG\"][\"level\"] to be equal to 'devel' instead got %v",
+		m["LOG"]["level"],
+	)
+	testing.expectf(
+		t,
+		strings.contains(m["LOG"]["file"], "/var/log/testing.log"),
+		"Expected m[\"LOG\"][\"file\"] to be equal to '/var/log/testing.log' instead got %v",
+		m["LOG"]["file"],
+	)
+	testing.expectf(
+		t,
+		strings.contains(m["USER"]["first_name"], "John"),
+		"Expected m[\"USER\"][\"first_name\"] to be equal to 'John' instead got %v",
+		m["USER"]["first_name"],
+	)
+	testing.expectf(
+		t,
+		strings.contains(m["USER"]["surname"], "Smith"),
+		"Expected m[\"USER\"][\"surname\"] to be equal to 'Smith' instead got %v",
+		m["USER"]["surname"],
+	)
+
+	testing.expectf(t, err == nil, "Expected `ini.load_map_from_string` to return a nil error, got %v", err)
+}
+
+@test
+ini_to_string :: proc(t: ^testing.T) {
+	m := ini.Map{
+		"LEVEL" = {
+			"LOG" = "debug",
+		},
+	}
+
+	str := ini.save_map_to_string(m, context.allocator)
+	defer delete(str)
+	delete(m["LEVEL"])
+	delete(m)
+
+	testing.expectf(
+		t,
+		strings.contains(str, "[LEVEL]LOG = debug"),
+		"Expected `ini.save_map_to_string` to return a string equal to \"[LEVEL]LOG = debug\", got %v",
+		str,
+	)
+}
+
+@test
+ini_iterator :: proc(t: ^testing.T) {
+	ini_data := `
+		[LOG]
+		level = "devel"
+		file = "/var/log/testing.log"
+
+		[USER]
+		first_name = "John"
+		surname = "Smith"
+	`
+
+	i := 0
+	iterator := ini.iterator_from_string(ini_data)
+	for key, value in ini.iterate(&iterator) {
+		if strings.contains(key, "level") {
+			testing.expectf(
+				t,
+				strings.contains(value, "devel"),
+				"Expected 'level' to be equal to 'devel' instead got '%v'",
+				value,
+			)
+		} else if strings.contains(key, "file") {
+			testing.expectf(
+				t,
+				strings.contains(value, "/var/log/testing.log"),
+				"Expected 'file' to be equal to '/var/log/testing.log' instead got '%v'",
+				value,
+			)
+		} else if strings.contains(key, "first_name") {
+			testing.expectf(
+				t,
+				strings.contains(value, "John"),
+				"Expected 'first_name' to be equal to 'John' instead got '%v'",
+				value,
+			)
+		} else if strings.contains(key, "surname") {
+			testing.expectf(
+				t,
+				strings.contains(value, "Smith"),
+				"Expected 'surname' to be equal to 'Smith' instead got '%v'",
+				value,
+			)
+		}
+		i += 1
+		}
+	testing.expectf(t, i == 4, "Expected to loop 4 times, only looped %v times", i)
+}
diff --git a/tests/core/encoding/json/test_core_json.odin b/tests/core/encoding/json/test_core_json.odin
index 92c050952..42ac9ce0f 100644
--- a/tests/core/encoding/json/test_core_json.odin
+++ b/tests/core/encoding/json/test_core_json.odin
@@ -3,6 +3,7 @@ package test_core_json
 import "core:encoding/json"
 import "core:testing"
 import "core:mem/virtual"
+import "base:runtime"
 
 @test
 parse_json :: proc(t: ^testing.T) {
@@ -348,6 +349,24 @@ unmarshal_json :: proc(t: ^testing.T) {
 	}
 }
 
+@test
+unmarshal_empty_struct :: proc(t: ^testing.T) {
+	TestStruct :: struct {}
+	test := make(map[string]TestStruct)
+	input: = `{
+		"test_1": {},
+		"test_2": {}
+	}`
+	err := json.unmarshal(transmute([]u8)input, &test)
+	defer {
+		for k in test {
+			delete(k)
+		}
+		delete(test)
+	}
+	testing.expect(t, err == nil, "Expected empty struct to unmarshal without error")
+}
+
 @test
 surrogate :: proc(t: ^testing.T) {
 	input := `+ + * 😃 - /`
@@ -368,4 +387,60 @@ utf8_string_of_multibyte_characters :: proc(t: ^testing.T) {
 	val, err := json.parse_string(`"🐛✅"`)
 	defer json.destroy_value(val)
 	testing.expectf(t, err == nil, "Expected `json.parse` to return nil, got %v", err)
+}
+
+@test
+struct_with_ignore_tags :: proc(t: ^testing.T) {
+	My_Struct :: struct {
+		a: string `json:"-"`,
+	}
+
+	my_struct := My_Struct{
+		a = "test",
+	}
+
+	my_struct_marshaled, marshal_err := json.marshal(my_struct)
+	defer delete(my_struct_marshaled)
+
+	testing.expectf(t, marshal_err == nil, "Expected `json.marshal` to return nil error, got %v", marshal_err)
+
+	my_struct_json := transmute(string)my_struct_marshaled
+	expected_json := `{}`
+
+	testing.expectf(t, expected_json == my_struct_json, "Expected `json.marshal` to return %s, got %s", expected_json, my_struct_json)
+}
+
+@test
+map_with_integer_keys :: proc(t: ^testing.T) {
+	my_map := make(map[i32]string)
+	defer delete_map(my_map)
+
+	my_map[-1] = "a"
+	my_map[0] = "b"
+	my_map[42] = "c"
+	my_map[99999999] = "d"
+
+	marshaled_data, marshal_err := json.marshal(my_map)
+	defer delete(marshaled_data)
+	
+	testing.expectf(t, marshal_err == nil, "Expected `json.marshal` to return nil error, got %v", marshal_err)
+
+	my_map2 := make(map[i32]string)
+	defer delete_map(my_map2)
+
+	unmarshal_err := json.unmarshal(marshaled_data, &my_map2)
+	defer for key, item in my_map2 {
+		runtime.delete_string(item)
+	}
+	testing.expectf(t, unmarshal_err == nil, "Expected `json.unmarshal` to return nil, got %v", unmarshal_err)
+
+	testing.expectf(t, len(my_map) == len(my_map2), "Expected %v map items to have been unmarshaled, got %v", len(my_map), len(my_map2))
+
+	for key, item in my_map {
+		testing.expectf(t, key in my_map2, "Expected key %v to be present in unmarshaled map", key)
+		
+		if key in my_map2 {
+			testing.expectf(t, runtime.string_eq(item, my_map2[key]), "Expected value %s to be present in unmarshaled map", key)
+		}
+	}
 }
\ No newline at end of file
diff --git a/tests/core/fmt/test_core_fmt.odin b/tests/core/fmt/test_core_fmt.odin
index 49142e24d..3a1eb37e7 100644
--- a/tests/core/fmt/test_core_fmt.odin
+++ b/tests/core/fmt/test_core_fmt.odin
@@ -372,6 +372,22 @@ test_odin_value_export :: proc(t: ^testing.T) {
 	}
 }
 
+@(test)
+leaking_struct_tag :: proc(t: ^testing.T) {
+	My_Struct :: struct {
+		names:      [^]string `fmt:"v,name_count"`,
+		name_count: int,
+	}
+
+	name := "hello?"
+	foo := My_Struct {
+		names = &name,
+		name_count = 1,
+	}
+
+	check(t, "My_Struct{names = [\"hello?\"], name_count = 1}", "%v", foo)
+}
+
 @(private)
 check :: proc(t: ^testing.T, exp: string, format: string, args: ..any, loc := #caller_location) {
 	got := fmt.tprintf(format, ..args)
diff --git a/tests/vendor/all.odin b/tests/vendor/all.odin
index 1ce56e786..1abbc5d7f 100644
--- a/tests/vendor/all.odin
+++ b/tests/vendor/all.odin
@@ -1,3 +1,4 @@
 package tests_vendor
 
 @(require) import "glfw"
+@(require) import "lua/5.4"
\ No newline at end of file
diff --git a/tests/vendor/lua/5.4/factorial.lua b/tests/vendor/lua/5.4/factorial.lua
new file mode 100644
index 000000000..00cfb20f7
--- /dev/null
+++ b/tests/vendor/lua/5.4/factorial.lua
@@ -0,0 +1,10 @@
+-- defines a factorial function
+function fact (n)
+  if n == 0 then
+    return 1
+  else
+    return n * fact(n-1)
+  end
+end
+    
+return fact(10)
\ No newline at end of file
diff --git a/tests/vendor/lua/5.4/test_vendor_lua.5.4.odin b/tests/vendor/lua/5.4/test_vendor_lua.5.4.odin
new file mode 100644
index 000000000..e331200ea
--- /dev/null
+++ b/tests/vendor/lua/5.4/test_vendor_lua.5.4.odin
@@ -0,0 +1,71 @@
+//+build windows, linux, darwin
+package test_vendor_lua_54
+
+import "core:testing"
+import "core:c"
+import lua "vendor:lua/5.4"
+import "base:runtime"
+
+@(test)
+// Test context.allocator and returning a string
+return_string_with_context_based_allocator :: proc(t: ^testing.T) {
+	_context := context
+
+	state: ^lua.State
+	state = lua.newstate(lua_context_allocator, &_context)
+	defer lua.close(state)
+
+	lua.L_dostring(state, "return 'somestring'")
+	str := lua.tostring(state, -1)
+
+	testing.expectf(
+		t, str == "somestring", "Expected Lua to return \"somestring\"",
+	)
+}
+
+@(test)
+// Test lua.dofile and returning an integer
+dofile_factorial :: proc(t: ^testing.T) {
+	state := lua.L_newstate()
+	defer lua.close(state)
+
+	FACT_10 :: 3628800
+
+	res := lua.L_dofile(state, #directory + "/factorial.lua")
+	testing.expectf(t, lua.Status(res) == .OK, "Expected L_dofile to return OKAY")
+
+	fact := lua.L_checkinteger(state, -1)
+
+	testing.expectf(t, fact == FACT_10, "Expected factorial(10) to return %v, got %v", FACT_10, fact)
+}
+
+@(test)
+// Test that our bindings didn't get out of sync with the API version
+verify_lua_api_version :: proc(t: ^testing.T) {
+	state := lua.L_newstate()
+	defer lua.close(state)
+
+	version := int(lua.version(state))
+
+	testing.expectf(t, version == lua.VERSION_NUM, "Expected lua.version to return %v, got %v", lua.VERSION_NUM, version)
+}
+
+// Simple context.allocator-based callback for Lua. Use `lua.newstate` to pass the context as user data.
+lua_context_allocator :: proc "c" (ud: rawptr, ptr: rawptr, osize, nsize: c.size_t) -> (buf: rawptr) {
+	old_size := int(osize)
+	new_size := int(nsize)
+	context = (^runtime.Context)(ud)^
+
+	if ptr == nil {
+		data, err := runtime.mem_alloc(new_size)
+		return raw_data(data) if err == .None else nil
+	} else {
+		if nsize > 0 {
+			data, err := runtime.mem_resize(ptr, old_size, new_size)
+			return raw_data(data) if err == .None else nil
+		} else {
+			runtime.mem_free(ptr)
+			return
+		}
+	}
+}
\ No newline at end of file
diff --git a/vendor/compress/lz4/lib/liblz4_static.lib b/vendor/compress/lz4/lib/liblz4_static.lib
new file mode 100644
index 000000000..b60a626a2
Binary files /dev/null and b/vendor/compress/lz4/lib/liblz4_static.lib differ
diff --git a/vendor/compress/lz4/lz4.odin b/vendor/compress/lz4/lz4.odin
new file mode 100644
index 000000000..310248d56
--- /dev/null
+++ b/vendor/compress/lz4/lz4.odin
@@ -0,0 +1,542 @@
+package vendor_compress_lz4
+
+when ODIN_OS == .Windows {
+	@(extra_linker_flags="/NODEFAULTLIB:libcmt")
+	foreign import lib "lib/liblz4_static.lib"
+}
+
+import "core:c"
+
+VERSION_MAJOR   ::  1    /* for breaking interface changes  */
+VERSION_MINOR   :: 10    /* for new (non-breaking) interface capabilities */
+VERSION_RELEASE ::  0    /* for tweaks, bug-fixes, or development */
+
+VERSION_NUMBER  :: VERSION_MAJOR *100*100 + VERSION_MINOR *100 + VERSION_RELEASE
+
+MEMORY_USAGE_MIN     :: 10
+MEMORY_USAGE_DEFAULT :: 14
+MEMORY_USAGE_MAX     :: 20
+
+MEMORY_USAGE :: MEMORY_USAGE_DEFAULT
+
+MAX_INPUT_SIZE :: 0x7E000000   /* 2_113_929_216 bytes */
+
+
+COMPRESSBOUND :: #force_inline proc "c" (isize: c.int) -> c.int {
+	return u32(isize) > MAX_INPUT_SIZE ? 0 : isize + (isize/255) + 16
+}
+
+
+DECODER_RING_BUFFER_SIZE :: #force_inline proc "c" (maxBlockSize: c.int) -> c.int {
+	return 65536 + 14 + maxBlockSize  /* for static allocation; maxBlockSize presumed valid */
+}
+
+@(default_calling_convention="c", link_prefix="LZ4_")
+foreign lib {
+	versionNumber :: proc() -> c.int ---   /**< library version number; useful to check dll version; requires v1.3.0+ */
+	versionString :: proc() -> cstring --- /**< library version string; useful to check dll version; requires v1.7.5+ */
+
+	/*! LZ4_compress_default() :
+	 *  Compresses 'srcSize' bytes from buffer 'src'
+	 *  into already allocated 'dst' buffer of size 'dstCapacity'.
+	 *  Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize).
+	 *  It also runs faster, so it's a recommended setting.
+	 *  If the function cannot compress 'src' into a more limited 'dst' budget,
+	 *  compression stops *immediately*, and the function result is zero.
+	 *  In which case, 'dst' content is undefined (invalid).
+	 *      srcSize : max supported value is LZ4_MAX_INPUT_SIZE.
+	 *      dstCapacity : size of buffer 'dst' (which must be already allocated)
+	 *     @return  : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity)
+	 *                or 0 if compression fails
+	 * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer).
+	 */
+	compress_default :: proc(src, dst: [^]byte, srcSize, dstCapacity: c.int) -> c.int ---
+
+	/*! LZ4_decompress_safe() :
+	 * @compressedSize : is the exact complete size of the compressed block.
+	 * @dstCapacity : is the size of destination buffer (which must be already allocated),
+	 *                presumed an upper bound of decompressed size.
+	 * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
+	 *           If destination buffer is not large enough, decoding will stop and output an error code (negative value).
+	 *           If the source stream is detected malformed, the function will stop decoding and return a negative result.
+	 * Note 1 : This function is protected against malicious data packets :
+	 *          it will never writes outside 'dst' buffer, nor read outside 'source' buffer,
+	 *          even if the compressed block is maliciously modified to order the decoder to do these actions.
+	 *          In such case, the decoder stops immediately, and considers the compressed block malformed.
+	 * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them.
+	 *          The implementation is free to send / store / derive this information in whichever way is most beneficial.
+	 *          If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead.
+	 */
+	decompress_safe :: proc(src, dst: [^]byte, compressedSize, dstCapacity: c.int) -> c.int ---
+
+
+	/*! LZ4_compressBound() :
+	    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
+	    This function is primarily useful for memory allocation purposes (destination buffer size).
+	    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+	    Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize)
+	        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+	        return : maximum output size in a "worst case" scenario
+	              or 0, if input size is incorrect (too large or negative)
+	*/
+	compressBound :: proc(inputSize: c.int) -> c.int ---
+
+	/*! LZ4_compress_fast() :
+	    Same as LZ4_compress_default(), but allows selection of "acceleration" factor.
+	    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+	    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
+	    An acceleration value of "1" is the same as regular LZ4_compress_default()
+	    Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c).
+	    Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c).
+	*/
+	compress_fast :: proc(src, dst: [^]byte, srcSize, dstCapacity: c.int, acceleration: c.int) -> c.int ---
+
+
+	/*! LZ4_compress_fast_extState() :
+	 *  Same as LZ4_compress_fast(), using an externally allocated memory space for its state.
+	 *  Use LZ4_sizeofState() to know how much memory must be allocated,
+	 *  and allocate it on 8-bytes boundaries (using `malloc()` typically).
+	 *  Then, provide this buffer as `void* state` to compression function.
+	 */
+	sizeofState :: proc() -> c.int ---
+	compress_fast_extState :: proc (state: rawptr, src, dst: [^]byte, srcSize, dstCapacity: c.int, acceleration: c.int) -> c.int ---
+
+
+	/*! LZ4_compress_destSize() :
+	 *  Reverse the logic : compresses as much data as possible from 'src' buffer
+	 *  into already allocated buffer 'dst', of size >= 'dstCapacity'.
+	 *  This function either compresses the entire 'src' content into 'dst' if it's large enough,
+	 *  or fill 'dst' buffer completely with as much data as possible from 'src'.
+	 *  note: acceleration parameter is fixed to "default".
+	 *
+	 * *srcSizePtr : in+out parameter. Initially contains size of input.
+	 *               Will be modified to indicate how many bytes where read from 'src' to fill 'dst'.
+	 *               New value is necessarily <= input value.
+	 * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity)
+	 *           or 0 if compression fails.
+	 *
+	 * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+):
+	 *        the produced compressed content could, in specific circumstances,
+	 *        require to be decompressed into a destination buffer larger
+	 *        by at least 1 byte than the content to decompress.
+	 *        If an application uses `LZ4_compress_destSize()`,
+	 *        it's highly recommended to update liblz4 to v1.9.2 or better.
+	 *        If this can't be done or ensured,
+	 *        the receiving decompression function should provide
+	 *        a dstCapacity which is > decompressedSize, by at least 1 byte.
+	 *        See https://github.com/lz4/lz4/issues/859 for details
+	 */
+	compress_destSize :: proc(src, dst: [^]byte, srcSizePtr: ^c.int, targetDstSize: c.int) -> c.int ---
+
+
+	/*! LZ4_decompress_safe_partial() :
+	 *  Decompress an LZ4 compressed block, of size 'srcSize' at position 'src',
+	 *  into destination buffer 'dst' of size 'dstCapacity'.
+	 *  Up to 'targetOutputSize' bytes will be decoded.
+	 *  The function stops decoding on reaching this objective.
+	 *  This can be useful to boost performance
+	 *  whenever only the beginning of a block is required.
+	 *
+	 * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize)
+	 *           If source stream is detected malformed, function returns a negative result.
+	 *
+	 *  Note 1 : @return can be < targetOutputSize, if compressed block contains less data.
+	 *
+	 *  Note 2 : targetOutputSize must be <= dstCapacity
+	 *
+	 *  Note 3 : this function effectively stops decoding on reaching targetOutputSize,
+	 *           so dstCapacity is kind of redundant.
+	 *           This is because in older versions of this function,
+	 *           decoding operation would still write complete sequences.
+	 *           Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize,
+	 *           it could write more bytes, though only up to dstCapacity.
+	 *           Some "margin" used to be required for this operation to work properly.
+	 *           Thankfully, this is no longer necessary.
+	 *           The function nonetheless keeps the same signature, in an effort to preserve API compatibility.
+	 *
+	 *  Note 4 : If srcSize is the exact size of the block,
+	 *           then targetOutputSize can be any value,
+	 *           including larger than the block's decompressed size.
+	 *           The function will, at most, generate block's decompressed size.
+	 *
+	 *  Note 5 : If srcSize is _larger_ than block's compressed size,
+	 *           then targetOutputSize **MUST** be <= block's decompressed size.
+	 *           Otherwise, *silent corruption will occur*.
+	 */
+	decompress_safe_partial :: proc (src, dst: [^]byte, srcSize, targetOutputSize, dstCapacity: c.int) -> c.int ---
+
+
+	createStream :: proc() -> ^stream_t ---
+	freeStream   :: proc(streamPtr: ^stream_t) -> c.int ---
+
+	/*! LZ4_resetStream_fast() : v1.9.0+
+	 *  Use this to prepare an LZ4_stream_t for a new chain of dependent blocks
+	 *  (e.g., LZ4_compress_fast_continue()).
+	 *
+	 *  An LZ4_stream_t must be initialized once before usage.
+	 *  This is automatically done when created by LZ4_createStream().
+	 *  However, should the LZ4_stream_t be simply declared on stack (for example),
+	 *  it's necessary to initialize it first, using LZ4_initStream().
+	 *
+	 *  After init, start any new stream with LZ4_resetStream_fast().
+	 *  A same LZ4_stream_t can be re-used multiple times consecutively
+	 *  and compress multiple streams,
+	 *  provided that it starts each new stream with LZ4_resetStream_fast().
+	 *
+	 *  LZ4_resetStream_fast() is much faster than LZ4_initStream(),
+	 *  but is not compatible with memory regions containing garbage data.
+	 *
+	 *  Note: it's only useful to call LZ4_resetStream_fast()
+	 *        in the context of streaming compression.
+	 *        The *extState* functions perform their own resets.
+	 *        Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive.
+	 */
+	resetStream_fast :: proc(streamPtr: ^stream_t) ---
+
+
+	/*! LZ4_loadDict() :
+	 *  Use this function to reference a static dictionary into LZ4_stream_t.
+	 *  The dictionary must remain available during compression.
+	 *  LZ4_loadDict() triggers a reset, so any previous data will be forgotten.
+	 *  The same dictionary will have to be loaded on decompression side for successful decoding.
+	 *  Dictionary are useful for better compression of small data (KB range).
+	 *  While LZ4 itself accepts any input as dictionary, dictionary efficiency is also a topic.
+	 *  When in doubt, employ the Zstandard's Dictionary Builder.
+	 *  Loading a size of 0 is allowed, and is the same as reset.
+	 * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded)
+	 */
+	loadDict :: proc(streamPtr: ^stream_t, dictionary: [^]byte, dictSize: c.int) -> c.int ---
+
+	/*! LZ4_loadDictSlow() : v1.10.0+
+	 *  Same as LZ4_loadDict(),
+	 *  but uses a bit more cpu to reference the dictionary content more thoroughly.
+	 *  This is expected to slightly improve compression ratio.
+	 *  The extra-cpu cost is likely worth it if the dictionary is re-used across multiple sessions.
+	 * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded)
+	 */
+	loadDictSlow :: proc(streamPtr: ^stream_t, dictionary: [^]byte, dictSize: c.int) -> c.int ---
+
+	/*! LZ4_attach_dictionary() : stable since v1.10.0
+	 *
+	 *  This allows efficient re-use of a static dictionary multiple times.
+	 *
+	 *  Rather than re-loading the dictionary buffer into a working context before
+	 *  each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a
+	 *  working LZ4_stream_t, this function introduces a no-copy setup mechanism,
+	 *  in which the working stream references @dictionaryStream in-place.
+	 *
+	 *  Several assumptions are made about the state of @dictionaryStream.
+	 *  Currently, only states which have been prepared by LZ4_loadDict() or
+	 *  LZ4_loadDictSlow() should be expected to work.
+	 *
+	 *  Alternatively, the provided @dictionaryStream may be NULL,
+	 *  in which case any existing dictionary stream is unset.
+	 *
+	 *  If a dictionary is provided, it replaces any pre-existing stream history.
+	 *  The dictionary contents are the only history that can be referenced and
+	 *  logically immediately precede the data compressed in the first subsequent
+	 *  compression call.
+	 *
+	 *  The dictionary will only remain attached to the working stream through the
+	 *  first compression call, at the end of which it is cleared.
+	 * @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged
+	 *  through the completion of the compression session.
+	 *
+	 *  Note: there is no equivalent LZ4_attach_*() method on the decompression side
+	 *  because there is no initialization cost, hence no need to share the cost across multiple sessions.
+	 *  To decompress LZ4 blocks using dictionary, attached or not,
+	 *  just employ the regular LZ4_setStreamDecode() for streaming,
+	 *  or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression.
+	 */
+	attach_dictionary :: proc(workingStream, dictionaryStream: ^stream_t) ---
+
+	/*! LZ4_compress_fast_continue() :
+	 *  Compress 'src' content using data from previously compressed blocks, for better compression ratio.
+	 * 'dst' buffer must be already allocated.
+	 *  If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+	 *
+	 * @return : size of compressed block
+	 *           or 0 if there is an error (typically, cannot fit into 'dst').
+	 *
+	 *  Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block.
+	 *           Each block has precise boundaries.
+	 *           Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata.
+	 *           It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together.
+	 *
+	 *  Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory !
+	 *
+	 *  Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB.
+	 *           Make sure that buffers are separated, by at least one byte.
+	 *           This construction ensures that each block only depends on previous block.
+	 *
+	 *  Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB.
+	 *
+	 *  Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed.
+	 */
+	compress_fast_continue :: proc(streamPtr: ^stream_t, src, dst: [^]byte, srcSize, dstCapacity: c.int, acceleration: c.int) -> c.int ---
+
+	/*! LZ4_saveDict() :
+	 *  If last 64KB data cannot be guaranteed to remain available at its current memory location,
+	 *  save it into a safer place (char* safeBuffer).
+	 *  This is schematically equivalent to a memcpy() followed by LZ4_loadDict(),
+	 *  but is much faster, because LZ4_saveDict() doesn't need to rebuild tables.
+	 * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error.
+	 */
+	saveDict :: proc(streamPtr: ^stream_t, safeBuffer: [^]byte, maxDictSize: c.int) -> c.int ---
+
+
+	createStreamDecode :: proc() -> ^streamDecode_t ---
+	freeStreamDecode   :: proc(LZ4_stream: ^streamDecode_t) -> c.int ---
+
+	/*! LZ4_setStreamDecode() :
+	 *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
+	 *  Use this function to start decompression of a new stream of blocks.
+	 *  A dictionary can optionally be set. Use NULL or size 0 for a reset order.
+	 *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
+	 * @return : 1 if OK, 0 if error
+	 */
+	setStreamDecode :: proc(LZ4_streamDecode: ^streamDecode_t, dictionary: [^]byte, dictSize: c.int) -> c.int ---
+
+	/*! LZ4_decoderRingBufferSize() : v1.8.2+
+	 *  Note : in a ring buffer scenario (optional),
+	 *  blocks are presumed decompressed next to each other
+	 *  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+	 *  at which stage it resumes from beginning of ring buffer.
+	 *  When setting such a ring buffer for streaming decompression,
+	 *  provides the minimum size of this ring buffer
+	 *  to be compatible with any source respecting maxBlockSize condition.
+	 * @return : minimum ring buffer size,
+	 *           or 0 if there is an error (invalid maxBlockSize).
+	 */
+	decoderRingBufferSize :: proc(maxBlockSize: c.int) -> c.int ---
+
+	/*! LZ4_decompress_safe_continue() :
+	 *  This decoding function allows decompression of consecutive blocks in "streaming" mode.
+	 *  The difference with the usual independent blocks is that
+	 *  new blocks are allowed to find references into former blocks.
+	 *  A block is an unsplittable entity, and must be presented entirely to the decompression function.
+	 *  LZ4_decompress_safe_continue() only accepts one block at a time.
+	 *  It's modeled after `LZ4_decompress_safe()` and behaves similarly.
+	 *
+	 * @LZ4_streamDecode : decompression state, tracking the position in memory of past data
+	 * @compressedSize : exact complete size of one compressed block.
+	 * @dstCapacity : size of destination buffer (which must be already allocated),
+	 *                must be an upper bound of decompressed size.
+	 * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
+	 *           If destination buffer is not large enough, decoding will stop and output an error code (negative value).
+	 *           If the source stream is detected malformed, the function will stop decoding and return a negative result.
+	 *
+	 *  The last 64KB of previously decoded data *must* remain available and unmodified
+	 *  at the memory position where they were previously decoded.
+	 *  If less than 64KB of data has been decoded, all the data must be present.
+	 *
+	 *  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+	 *  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+	 *    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+	 *    In which case, encoding and decoding buffers do not need to be synchronized.
+	 *    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+	 *  - Synchronized mode :
+	 *    Decompression buffer size is _exactly_ the same as compression buffer size,
+	 *    and follows exactly same update rule (block boundaries at same positions),
+	 *    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+	 *    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
+	 *  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+	 *    In which case, encoding and decoding buffers do not need to be synchronized,
+	 *    and encoding ring buffer can have any size, including small ones ( < 64 KB).
+	 *
+	 *  Whenever these conditions are not possible,
+	 *  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+	 *  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
+	*/
+	decompress_safe_continue :: proc(LZ4_streamDecode: ^streamDecode_t, src, dst: [^]byte, srcSize, dstCapacity: c.int) -> c.int ---
+
+
+	/*! LZ4_decompress_safe_usingDict() :
+	 *  Works the same as
+	 *  a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue()
+	 *  However, it's stateless: it doesn't need any LZ4_streamDecode_t state.
+	 *  Dictionary is presumed stable : it must remain accessible and unmodified during decompression.
+	 *  Performance tip : Decompression speed can be substantially increased
+	 *                    when dst == dictStart + dictSize.
+	 */
+	decompress_safe_usingDict :: proc(src, dst: [^]byte, srcSize, dstCapacity: c.int, dictStart: [^]byte, dictSize: c.int) -> c.int ---
+
+	/*! LZ4_decompress_safe_partial_usingDict() :
+	 *  Behaves the same as LZ4_decompress_safe_partial()
+	 *  with the added ability to specify a memory segment for past data.
+	 *  Performance tip : Decompression speed can be substantially increased
+	 *                    when dst == dictStart + dictSize.
+	 */
+	decompress_safe_partial_usingDict :: proc(src, dst: [^]byte, compressedSize, targetOutputSize, maxOutputSize: c.int, dictStart: [^]byte, dictSize: c.int) -> c.int ---
+
+}
+
+
+STREAM_MINSIZE :: (1 << MEMORY_USAGE) + 32  /* static size, for inter-version compatibility */
+
+stream_t :: struct #raw_union {
+	minStateSize:      [STREAM_MINSIZE]byte,
+	internal_donotuse: stream_t_internal,
+}
+
+
+HASHLOG       :: MEMORY_USAGE-2
+HASHTABLESIZE :: 1 << MEMORY_USAGE
+HASH_SIZE_U32 :: 1 << HASHLOG      /* required as macro for static allocation */
+
+stream_t_internal :: struct {
+	hashTable:     [HASH_SIZE_U32]u32,
+	dictionary:    [^]byte,
+	dictCtx:       ^stream_t_internal,
+	currentOffset: u32,
+	tableType:     u32,
+	dictSize:      u32,
+	/* Implicit padding to ensure structure is aligned */
+}
+
+
+STREAMDECODE_MINSIZE :: 32
+streamDecode_t :: struct #raw_union {
+	minStateSize:      [STREAMDECODE_MINSIZE]byte,
+	internal_donotuse: streamDecode_t_internal,
+}
+
+streamDecode_t_internal :: struct {
+	externalDict: [^]byte,
+	prefixEnd:    [^]byte,
+	extDictSize:  c.size_t,
+	prefixSize:   c.size_t,
+}
+
+
+
+///////////////////
+// lz4hc
+
+CLEVEL_MIN     ::  2
+CLEVEL_DEFAULT ::  9
+CLEVEL_OPT_MIN :: 10
+CLEVEL_MAX     :: 12
+
+
+@(default_calling_convention="c", link_prefix="LZ4_")
+foreign lib {
+	/*! LZ4_compress_HC() :
+	 *  Compress data from `src` into `dst`, using the powerful but slower "HC" algorithm.
+	 * `dst` must be already allocated.
+	 *  Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h")
+	 *  Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h")
+	 * `compressionLevel` : any value between 1 and LZ4HC_CLEVEL_MAX will work.
+	 *                      Values > LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX.
+	 * @return : the number of bytes written into 'dst'
+	 *           or 0 if compression fails.
+	 */
+	compress_HC :: proc(src, dst: [^]byte, srcSize, dstCapacity, compressionLevel: c.int) -> c.int ---
+
+
+	/*! LZ4_compress_HC_extStateHC() :
+	 *  Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`.
+	 * `state` size is provided by LZ4_sizeofStateHC().
+	 *  Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() should do properly).
+	 */
+	sizeofStateHC :: proc() -> c.int ---
+	compress_HC_extStateHC :: proc(stateHC: rawptr, src, dst: [^]byte, srcSize, maxDstSize: c.int, compressionLevel: c.int) -> c.int ---
+
+
+	/*! LZ4_compress_HC_destSize() : v1.9.0+
+	 *  Will compress as much data as possible from `src`
+	 *  to fit into `targetDstSize` budget.
+	 *  Result is provided in 2 parts :
+	 * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize)
+	 *           or 0 if compression fails.
+	 * `srcSizePtr` : on success, *srcSizePtr is updated to indicate how much bytes were read from `src`
+	 */
+	compress_HC_destSize :: proc(stateHC: rawptr, src, dst: [^]byte, srcSizePtr: ^c.int, targetDstSize: c.int, compressionLevel: c.int) -> c.int ---
+
+	/*! LZ4_createStreamHC() and LZ4_freeStreamHC() :
+	 *  These functions create and release memory for LZ4 HC streaming state.
+	 *  Newly created states are automatically initialized.
+	 *  A same state can be used multiple times consecutively,
+	 *  starting with LZ4_resetStreamHC_fast() to start a new stream of blocks.
+	 */
+	createStreamHC :: proc() -> ^streamHC_t ---
+	freeStreamHC :: proc(streamHCPtr: ^streamHC_t) -> c.int ---
+
+	resetStreamHC_fast :: proc(streamHCPtr: ^streamHC_t, compressionLevel: c.int) ---   /* v1.9.0+ */
+	loadDictHC         :: proc(streamHCPtr: ^streamHC_t, dictionary: [^]byte, dictSize: c.int) -> c.int ---
+
+	compress_HC_continue :: proc(streamHCPtr: ^streamHC_t, src, dst: [^]byte, srcSize, maxDstSize: c.int) -> c.int ---
+
+	/*! LZ4_compress_HC_continue_destSize() : v1.9.0+
+	 *  Similar to LZ4_compress_HC_continue(),
+	 *  but will read as much data as possible from `src`
+	 *  to fit into `targetDstSize` budget.
+	 *  Result is provided into 2 parts :
+	 * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize)
+	 *           or 0 if compression fails.
+	 * `srcSizePtr` : on success, *srcSizePtr will be updated to indicate how much bytes were read from `src`.
+	 *           Note that this function may not consume the entire input.
+	 */
+	compress_HC_continue_destSize:: proc(LZ4_streamHCPtr: ^streamHC_t, src, dst: [^]byte, srcSizePtr: ^c.int, targetDstSize: c.int) -> c.int ---
+
+	saveDictHC :: proc(streamHCPtr: ^streamHC_t, safeBuffer: [^]byte, maxDictSize: c.int) -> c.int ---
+
+	/*! LZ4_attach_HC_dictionary() : stable since v1.10.0
+	 *  This API allows for the efficient re-use of a static dictionary many times.
+	 *
+	 *  Rather than re-loading the dictionary buffer into a working context before
+	 *  each compression, or copying a pre-loaded dictionary's LZ4_streamHC_t into a
+	 *  working LZ4_streamHC_t, this function introduces a no-copy setup mechanism,
+	 *  in which the working stream references the dictionary stream in-place.
+	 *
+	 *  Several assumptions are made about the state of the dictionary stream.
+	 *  Currently, only streams which have been prepared by LZ4_loadDictHC() should
+	 *  be expected to work.
+	 *
+	 *  Alternatively, the provided dictionary stream pointer may be NULL, in which
+	 *  case any existing dictionary stream is unset.
+	 *
+	 *  A dictionary should only be attached to a stream without any history (i.e.,
+	 *  a stream that has just been reset).
+	 *
+	 *  The dictionary will remain attached to the working stream only for the
+	 *  current stream session. Calls to LZ4_resetStreamHC(_fast) will remove the
+	 *  dictionary context association from the working stream. The dictionary
+	 *  stream (and source buffer) must remain in-place / accessible / unchanged
+	 *  through the lifetime of the stream session.
+	 */
+	attach_HC_dictionary :: proc(working_stream, dictionary_stream: ^streamHC_t) ---
+}
+
+
+HC_DICTIONARY_LOGSIZE :: 16
+HC_MAXD               :: 1<<HC_DICTIONARY_LOGSIZE
+HC_MAXD_MASK          :: HC_MAXD - 1
+
+HC_HASH_LOG           :: 15
+HC_HASHTABLESIZE      :: 1 << HC_HASH_LOG
+HC_HASH_MASK          :: HC_HASHTABLESIZE - 1
+
+
+streamHC_internal_t :: struct {
+	hashTable:        [HC_HASHTABLESIZE]u32,
+	chainTable:       [HC_MAXD]u16,
+	end:              [^]byte,  /* next block here to continue on current prefix */
+	prefixStart:      [^]byte,  /* Indexes relative to this position */
+	dictStart:        [^]byte,  /* alternate reference for extDict */
+	dictLimit:        u32,      /* below that point, need extDict */
+	lowLimit:         u32,      /* below that point, no more history */
+	nextToUpdate:     u32,      /* index from which to continue dictionary update */
+	compressionLevel: c.short,
+	favorDecSpeed:    i8,       /* favor decompression speed if this flag set,
+	                               otherwise, favor compression ratio */
+	dirty:            i8,       /* stream has to be fully reset if this flag is set */
+	dictCtx:          ^streamHC_internal_t,
+}
+
+STREAMHC_MINSIZE :: 262200
+
+streamHC_t :: struct #raw_union {
+	minStateSize:      [STREAMHC_MINSIZE]byte,
+	internal_donotuse: streamHC_internal_t,
+}
\ No newline at end of file
diff --git a/vendor/compress/lz4/src/lz4.h b/vendor/compress/lz4/src/lz4.h
new file mode 100644
index 000000000..80e3e5ca0
--- /dev/null
+++ b/vendor/compress/lz4/src/lz4.h
@@ -0,0 +1,884 @@
+/*
+ *  LZ4 - Fast LZ compression algorithm
+ *  Header File
+ *  Copyright (C) 2011-2023, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+    - LZ4 homepage : http://www.lz4.org
+    - LZ4 source repository : https://github.com/lz4/lz4
+*/
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef LZ4_H_2983827168210
+#define LZ4_H_2983827168210
+
+/* --- Dependency --- */
+#include <stddef.h>   /* size_t */
+
+
+/**
+  Introduction
+
+  LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core,
+  scalable with multi-cores CPU. It features an extremely fast decoder, with speed in
+  multiple GB/s per core, typically reaching RAM speed limits on multi-core systems.
+
+  The LZ4 compression library provides in-memory compression and decompression functions.
+  It gives full buffer control to user.
+  Compression can be done in:
+    - a single step (described as Simple Functions)
+    - a single step, reusing a context (described in Advanced Functions)
+    - unbounded multiple steps (described as Streaming compression)
+
+  lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md).
+  Decompressing such a compressed block requires additional metadata.
+  Exact metadata depends on exact decompression function.
+  For the typical case of LZ4_decompress_safe(),
+  metadata includes block's compressed size, and maximum bound of decompressed size.
+  Each application is free to encode and pass such metadata in whichever way it wants.
+
+  lz4.h only handle blocks, it can not generate Frames.
+
+  Blocks are different from Frames (doc/lz4_Frame_format.md).
+  Frames bundle both blocks and metadata in a specified manner.
+  Embedding metadata is required for compressed data to be self-contained and portable.
+  Frame format is delivered through a companion API, declared in lz4frame.h.
+  The `lz4` CLI can only manage frames.
+*/
+
+/*^***************************************************************
+*  Export parameters
+*****************************************************************/
+/*
+*  LZ4_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*  LZ4LIB_VISIBILITY :
+*  Control library symbols visibility.
+*/
+#ifndef LZ4LIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define LZ4LIB_VISIBILITY
+#  endif
+#endif
+#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
+#  define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY
+#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
+#  define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+#  define LZ4LIB_API LZ4LIB_VISIBILITY
+#endif
+
+/*! LZ4_FREESTANDING :
+ *  When this macro is set to 1, it enables "freestanding mode" that is
+ *  suitable for typical freestanding environment which doesn't support
+ *  standard C library.
+ *
+ *  - LZ4_FREESTANDING is a compile-time switch.
+ *  - It requires the following macros to be defined:
+ *    LZ4_memcpy, LZ4_memmove, LZ4_memset.
+ *  - It only enables LZ4/HC functions which don't use heap.
+ *    All LZ4F_* functions are not supported.
+ *  - See tests/freestanding.c to check its basic setup.
+ */
+#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1)
+#  define LZ4_HEAPMODE 0
+#  define LZ4HC_HEAPMODE 0
+#  define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1
+#  if !defined(LZ4_memcpy)
+#    error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'."
+#  endif
+#  if !defined(LZ4_memset)
+#    error "LZ4_FREESTANDING requires macro 'LZ4_memset'."
+#  endif
+#  if !defined(LZ4_memmove)
+#    error "LZ4_FREESTANDING requires macro 'LZ4_memmove'."
+#  endif
+#elif ! defined(LZ4_FREESTANDING)
+#  define LZ4_FREESTANDING 0
+#endif
+
+
+/*------   Version   ------*/
+#define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
+#define LZ4_VERSION_MINOR   10    /* for new (non-breaking) interface capabilities */
+#define LZ4_VERSION_RELEASE  0    /* for tweaks, bug-fixes, or development */
+
+#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+
+#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE
+#define LZ4_QUOTE(str) #str
+#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str)
+#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION)  /* requires v1.7.3+ */
+
+LZ4LIB_API int LZ4_versionNumber (void);  /**< library version number; useful to check dll version; requires v1.3.0+ */
+LZ4LIB_API const char* LZ4_versionString (void);   /**< library version string; useful to check dll version; requires v1.7.5+ */
+
+
+/*-************************************
+*  Tuning memory usage
+**************************************/
+/*!
+ * LZ4_MEMORY_USAGE :
+ * Can be selected at compile time, by setting LZ4_MEMORY_USAGE.
+ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB)
+ * Increasing memory usage improves compression ratio, generally at the cost of speed.
+ * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality.
+ * Default value is 14, for 16KB, which nicely fits into most L1 caches.
+ */
+#ifndef LZ4_MEMORY_USAGE
+# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT
+#endif
+
+/* These are absolute limits, they should not be changed by users */
+#define LZ4_MEMORY_USAGE_MIN 10
+#define LZ4_MEMORY_USAGE_DEFAULT 14
+#define LZ4_MEMORY_USAGE_MAX 20
+
+#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN)
+#  error "LZ4_MEMORY_USAGE is too small !"
+#endif
+
+#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX)
+#  error "LZ4_MEMORY_USAGE is too large !"
+#endif
+
+/*-************************************
+*  Simple Functions
+**************************************/
+/*! LZ4_compress_default() :
+ *  Compresses 'srcSize' bytes from buffer 'src'
+ *  into already allocated 'dst' buffer of size 'dstCapacity'.
+ *  Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize).
+ *  It also runs faster, so it's a recommended setting.
+ *  If the function cannot compress 'src' into a more limited 'dst' budget,
+ *  compression stops *immediately*, and the function result is zero.
+ *  In which case, 'dst' content is undefined (invalid).
+ *      srcSize : max supported value is LZ4_MAX_INPUT_SIZE.
+ *      dstCapacity : size of buffer 'dst' (which must be already allocated)
+ *     @return  : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity)
+ *                or 0 if compression fails
+ * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer).
+ */
+LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity);
+
+/*! LZ4_decompress_safe() :
+ * @compressedSize : is the exact complete size of the compressed block.
+ * @dstCapacity : is the size of destination buffer (which must be already allocated),
+ *                presumed an upper bound of decompressed size.
+ * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
+ *           If destination buffer is not large enough, decoding will stop and output an error code (negative value).
+ *           If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ * Note 1 : This function is protected against malicious data packets :
+ *          it will never writes outside 'dst' buffer, nor read outside 'source' buffer,
+ *          even if the compressed block is maliciously modified to order the decoder to do these actions.
+ *          In such case, the decoder stops immediately, and considers the compressed block malformed.
+ * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them.
+ *          The implementation is free to send / store / derive this information in whichever way is most beneficial.
+ *          If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead.
+ */
+LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity);
+
+
+/*-************************************
+*  Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*! LZ4_compressBound() :
+    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
+    This function is primarily useful for memory allocation purposes (destination buffer size).
+    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+    Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize)
+        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+        return : maximum output size in a "worst case" scenario
+              or 0, if input size is incorrect (too large or negative)
+*/
+LZ4LIB_API int LZ4_compressBound(int inputSize);
+
+/*! LZ4_compress_fast() :
+    Same as LZ4_compress_default(), but allows selection of "acceleration" factor.
+    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
+    An acceleration value of "1" is the same as regular LZ4_compress_default()
+    Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c).
+    Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c).
+*/
+LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+
+/*! LZ4_compress_fast_extState() :
+ *  Same as LZ4_compress_fast(), using an externally allocated memory space for its state.
+ *  Use LZ4_sizeofState() to know how much memory must be allocated,
+ *  and allocate it on 8-bytes boundaries (using `malloc()` typically).
+ *  Then, provide this buffer as `void* state` to compression function.
+ */
+LZ4LIB_API int LZ4_sizeofState(void);
+LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+/*! LZ4_compress_destSize() :
+ *  Reverse the logic : compresses as much data as possible from 'src' buffer
+ *  into already allocated buffer 'dst', of size >= 'dstCapacity'.
+ *  This function either compresses the entire 'src' content into 'dst' if it's large enough,
+ *  or fill 'dst' buffer completely with as much data as possible from 'src'.
+ *  note: acceleration parameter is fixed to "default".
+ *
+ * *srcSizePtr : in+out parameter. Initially contains size of input.
+ *               Will be modified to indicate how many bytes where read from 'src' to fill 'dst'.
+ *               New value is necessarily <= input value.
+ * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity)
+ *           or 0 if compression fails.
+ *
+ * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+):
+ *        the produced compressed content could, in specific circumstances,
+ *        require to be decompressed into a destination buffer larger
+ *        by at least 1 byte than the content to decompress.
+ *        If an application uses `LZ4_compress_destSize()`,
+ *        it's highly recommended to update liblz4 to v1.9.2 or better.
+ *        If this can't be done or ensured,
+ *        the receiving decompression function should provide
+ *        a dstCapacity which is > decompressedSize, by at least 1 byte.
+ *        See https://github.com/lz4/lz4/issues/859 for details
+ */
+LZ4LIB_API int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize);
+
+/*! LZ4_decompress_safe_partial() :
+ *  Decompress an LZ4 compressed block, of size 'srcSize' at position 'src',
+ *  into destination buffer 'dst' of size 'dstCapacity'.
+ *  Up to 'targetOutputSize' bytes will be decoded.
+ *  The function stops decoding on reaching this objective.
+ *  This can be useful to boost performance
+ *  whenever only the beginning of a block is required.
+ *
+ * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize)
+ *           If source stream is detected malformed, function returns a negative result.
+ *
+ *  Note 1 : @return can be < targetOutputSize, if compressed block contains less data.
+ *
+ *  Note 2 : targetOutputSize must be <= dstCapacity
+ *
+ *  Note 3 : this function effectively stops decoding on reaching targetOutputSize,
+ *           so dstCapacity is kind of redundant.
+ *           This is because in older versions of this function,
+ *           decoding operation would still write complete sequences.
+ *           Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize,
+ *           it could write more bytes, though only up to dstCapacity.
+ *           Some "margin" used to be required for this operation to work properly.
+ *           Thankfully, this is no longer necessary.
+ *           The function nonetheless keeps the same signature, in an effort to preserve API compatibility.
+ *
+ *  Note 4 : If srcSize is the exact size of the block,
+ *           then targetOutputSize can be any value,
+ *           including larger than the block's decompressed size.
+ *           The function will, at most, generate block's decompressed size.
+ *
+ *  Note 5 : If srcSize is _larger_ than block's compressed size,
+ *           then targetOutputSize **MUST** be <= block's decompressed size.
+ *           Otherwise, *silent corruption will occur*.
+ */
+LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity);
+
+
+/*-*********************************************
+*  Streaming Compression Functions
+***********************************************/
+typedef union LZ4_stream_u LZ4_stream_t;  /* incomplete type (defined later) */
+
+/*!
+ Note about RC_INVOKED
+
+ - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio).
+   https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros
+
+ - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars)
+   and reports warning "RC4011: identifier truncated".
+
+ - To eliminate the warning, we surround long preprocessor symbol with
+   "#if !defined(RC_INVOKED) ... #endif" block that means
+   "skip this block when rc.exe is trying to read it".
+*/
+#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */
+#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+LZ4LIB_API LZ4_stream_t* LZ4_createStream(void);
+LZ4LIB_API int           LZ4_freeStream (LZ4_stream_t* streamPtr);
+#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */
+#endif
+
+/*! LZ4_resetStream_fast() : v1.9.0+
+ *  Use this to prepare an LZ4_stream_t for a new chain of dependent blocks
+ *  (e.g., LZ4_compress_fast_continue()).
+ *
+ *  An LZ4_stream_t must be initialized once before usage.
+ *  This is automatically done when created by LZ4_createStream().
+ *  However, should the LZ4_stream_t be simply declared on stack (for example),
+ *  it's necessary to initialize it first, using LZ4_initStream().
+ *
+ *  After init, start any new stream with LZ4_resetStream_fast().
+ *  A same LZ4_stream_t can be re-used multiple times consecutively
+ *  and compress multiple streams,
+ *  provided that it starts each new stream with LZ4_resetStream_fast().
+ *
+ *  LZ4_resetStream_fast() is much faster than LZ4_initStream(),
+ *  but is not compatible with memory regions containing garbage data.
+ *
+ *  Note: it's only useful to call LZ4_resetStream_fast()
+ *        in the context of streaming compression.
+ *        The *extState* functions perform their own resets.
+ *        Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive.
+ */
+LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr);
+
+/*! LZ4_loadDict() :
+ *  Use this function to reference a static dictionary into LZ4_stream_t.
+ *  The dictionary must remain available during compression.
+ *  LZ4_loadDict() triggers a reset, so any previous data will be forgotten.
+ *  The same dictionary will have to be loaded on decompression side for successful decoding.
+ *  Dictionary are useful for better compression of small data (KB range).
+ *  While LZ4 itself accepts any input as dictionary, dictionary efficiency is also a topic.
+ *  When in doubt, employ the Zstandard's Dictionary Builder.
+ *  Loading a size of 0 is allowed, and is the same as reset.
+ * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded)
+ */
+LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*! LZ4_loadDictSlow() : v1.10.0+
+ *  Same as LZ4_loadDict(),
+ *  but uses a bit more cpu to reference the dictionary content more thoroughly.
+ *  This is expected to slightly improve compression ratio.
+ *  The extra-cpu cost is likely worth it if the dictionary is re-used across multiple sessions.
+ * @return : loaded dictionary size, in bytes (note: only the last 64 KB are loaded)
+ */
+LZ4LIB_API int LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*! LZ4_attach_dictionary() : stable since v1.10.0
+ *
+ *  This allows efficient re-use of a static dictionary multiple times.
+ *
+ *  Rather than re-loading the dictionary buffer into a working context before
+ *  each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a
+ *  working LZ4_stream_t, this function introduces a no-copy setup mechanism,
+ *  in which the working stream references @dictionaryStream in-place.
+ *
+ *  Several assumptions are made about the state of @dictionaryStream.
+ *  Currently, only states which have been prepared by LZ4_loadDict() or
+ *  LZ4_loadDictSlow() should be expected to work.
+ *
+ *  Alternatively, the provided @dictionaryStream may be NULL,
+ *  in which case any existing dictionary stream is unset.
+ *
+ *  If a dictionary is provided, it replaces any pre-existing stream history.
+ *  The dictionary contents are the only history that can be referenced and
+ *  logically immediately precede the data compressed in the first subsequent
+ *  compression call.
+ *
+ *  The dictionary will only remain attached to the working stream through the
+ *  first compression call, at the end of which it is cleared.
+ * @dictionaryStream stream (and source buffer) must remain in-place / accessible / unchanged
+ *  through the completion of the compression session.
+ *
+ *  Note: there is no equivalent LZ4_attach_*() method on the decompression side
+ *  because there is no initialization cost, hence no need to share the cost across multiple sessions.
+ *  To decompress LZ4 blocks using dictionary, attached or not,
+ *  just employ the regular LZ4_setStreamDecode() for streaming,
+ *  or the stateless LZ4_decompress_safe_usingDict() for one-shot decompression.
+ */
+LZ4LIB_API void
+LZ4_attach_dictionary(LZ4_stream_t* workingStream,
+                const LZ4_stream_t* dictionaryStream);
+
+/*! LZ4_compress_fast_continue() :
+ *  Compress 'src' content using data from previously compressed blocks, for better compression ratio.
+ * 'dst' buffer must be already allocated.
+ *  If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+ *
+ * @return : size of compressed block
+ *           or 0 if there is an error (typically, cannot fit into 'dst').
+ *
+ *  Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block.
+ *           Each block has precise boundaries.
+ *           Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata.
+ *           It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together.
+ *
+ *  Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory !
+ *
+ *  Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB.
+ *           Make sure that buffers are separated, by at least one byte.
+ *           This construction ensures that each block only depends on previous block.
+ *
+ *  Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB.
+ *
+ *  Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed.
+ */
+LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+/*! LZ4_saveDict() :
+ *  If last 64KB data cannot be guaranteed to remain available at its current memory location,
+ *  save it into a safer place (char* safeBuffer).
+ *  This is schematically equivalent to a memcpy() followed by LZ4_loadDict(),
+ *  but is much faster, because LZ4_saveDict() doesn't need to rebuild tables.
+ * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error.
+ */
+LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize);
+
+
+/*-**********************************************
+*  Streaming Decompression Functions
+*  Bufferless synchronous API
+************************************************/
+typedef union LZ4_streamDecode_u LZ4_streamDecode_t;   /* tracking context */
+
+/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() :
+ *  creation / destruction of streaming decompression tracking context.
+ *  A tracking context can be re-used multiple times.
+ */
+#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */
+#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+LZ4LIB_API int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */
+#endif
+
+/*! LZ4_setStreamDecode() :
+ *  An LZ4_streamDecode_t context can be allocated once and re-used multiple times.
+ *  Use this function to start decompression of a new stream of blocks.
+ *  A dictionary can optionally be set. Use NULL or size 0 for a reset order.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during next decompression.
+ * @return : 1 if OK, 0 if error
+ */
+LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
+
+/*! LZ4_decoderRingBufferSize() : v1.8.2+
+ *  Note : in a ring buffer scenario (optional),
+ *  blocks are presumed decompressed next to each other
+ *  up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize),
+ *  at which stage it resumes from beginning of ring buffer.
+ *  When setting such a ring buffer for streaming decompression,
+ *  provides the minimum size of this ring buffer
+ *  to be compatible with any source respecting maxBlockSize condition.
+ * @return : minimum ring buffer size,
+ *           or 0 if there is an error (invalid maxBlockSize).
+ */
+LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize);
+#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize))  /* for static allocation; maxBlockSize presumed valid */
+
+/*! LZ4_decompress_safe_continue() :
+ *  This decoding function allows decompression of consecutive blocks in "streaming" mode.
+ *  The difference with the usual independent blocks is that
+ *  new blocks are allowed to find references into former blocks.
+ *  A block is an unsplittable entity, and must be presented entirely to the decompression function.
+ *  LZ4_decompress_safe_continue() only accepts one block at a time.
+ *  It's modeled after `LZ4_decompress_safe()` and behaves similarly.
+ *
+ * @LZ4_streamDecode : decompression state, tracking the position in memory of past data
+ * @compressedSize : exact complete size of one compressed block.
+ * @dstCapacity : size of destination buffer (which must be already allocated),
+ *                must be an upper bound of decompressed size.
+ * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity)
+ *           If destination buffer is not large enough, decoding will stop and output an error code (negative value).
+ *           If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ *
+ *  The last 64KB of previously decoded data *must* remain available and unmodified
+ *  at the memory position where they were previously decoded.
+ *  If less than 64KB of data has been decoded, all the data must be present.
+ *
+ *  Special : if decompression side sets a ring buffer, it must respect one of the following conditions :
+ *  - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize).
+ *    maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes.
+ *    In which case, encoding and decoding buffers do not need to be synchronized.
+ *    Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize.
+ *  - Synchronized mode :
+ *    Decompression buffer size is _exactly_ the same as compression buffer size,
+ *    and follows exactly same update rule (block boundaries at same positions),
+ *    and decoding function is provided with exact decompressed size of each block (exception for last block of the stream),
+ *    _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB).
+ *  - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+ *    In which case, encoding and decoding buffers do not need to be synchronized,
+ *    and encoding ring buffer can have any size, including small ones ( < 64 KB).
+ *
+ *  Whenever these conditions are not possible,
+ *  save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression,
+ *  then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block.
+*/
+LZ4LIB_API int
+LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode,
+                        const char* src, char* dst,
+                        int srcSize, int dstCapacity);
+
+
+/*! LZ4_decompress_safe_usingDict() :
+ *  Works the same as
+ *  a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue()
+ *  However, it's stateless: it doesn't need any LZ4_streamDecode_t state.
+ *  Dictionary is presumed stable : it must remain accessible and unmodified during decompression.
+ *  Performance tip : Decompression speed can be substantially increased
+ *                    when dst == dictStart + dictSize.
+ */
+LZ4LIB_API int
+LZ4_decompress_safe_usingDict(const char* src, char* dst,
+                              int srcSize, int dstCapacity,
+                              const char* dictStart, int dictSize);
+
+/*! LZ4_decompress_safe_partial_usingDict() :
+ *  Behaves the same as LZ4_decompress_safe_partial()
+ *  with the added ability to specify a memory segment for past data.
+ *  Performance tip : Decompression speed can be substantially increased
+ *                    when dst == dictStart + dictSize.
+ */
+LZ4LIB_API int
+LZ4_decompress_safe_partial_usingDict(const char* src, char* dst,
+                                      int compressedSize,
+                                      int targetOutputSize, int maxOutputSize,
+                                      const char* dictStart, int dictSize);
+
+#endif /* LZ4_H_2983827168210 */
+
+
+/*^*************************************
+ * !!!!!!   STATIC LINKING ONLY   !!!!!!
+ ***************************************/
+
+/*-****************************************************************************
+ * Experimental section
+ *
+ * Symbols declared in this section must be considered unstable. Their
+ * signatures or semantics may change, or they may be removed altogether in the
+ * future. They are therefore only safe to depend on when the caller is
+ * statically linked against the library.
+ *
+ * To protect against unsafe usage, not only are the declarations guarded,
+ * the definitions are hidden by default
+ * when building LZ4 as a shared/dynamic library.
+ *
+ * In order to access these declarations,
+ * define LZ4_STATIC_LINKING_ONLY in your application
+ * before including LZ4's headers.
+ *
+ * In order to make their implementations accessible dynamically, you must
+ * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library.
+ ******************************************************************************/
+
+#ifdef LZ4_STATIC_LINKING_ONLY
+
+#ifndef LZ4_STATIC_3504398509
+#define LZ4_STATIC_3504398509
+
+#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS
+# define LZ4LIB_STATIC_API LZ4LIB_API
+#else
+# define LZ4LIB_STATIC_API
+#endif
+
+
+/*! LZ4_compress_fast_extState_fastReset() :
+ *  A variant of LZ4_compress_fast_extState().
+ *
+ *  Using this variant avoids an expensive initialization step.
+ *  It is only safe to call if the state buffer is known to be correctly initialized already
+ *  (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized").
+ *  From a high level, the difference is that
+ *  this function initializes the provided state with a call to something like LZ4_resetStream_fast()
+ *  while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream().
+ */
+LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration);
+
+/*! LZ4_compress_destSize_extState() : introduced in v1.10.0
+ *  Same as LZ4_compress_destSize(), but using an externally allocated state.
+ *  Also: exposes @acceleration
+ */
+int LZ4_compress_destSize_extState(void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize, int acceleration);
+
+/*! In-place compression and decompression
+ *
+ * It's possible to have input and output sharing the same buffer,
+ * for highly constrained memory environments.
+ * In both cases, it requires input to lay at the end of the buffer,
+ * and decompression to start at beginning of the buffer.
+ * Buffer size must feature some margin, hence be larger than final size.
+ *
+ * |<------------------------buffer--------------------------------->|
+ *                             |<-----------compressed data--------->|
+ * |<-----------decompressed size------------------>|
+ *                                                  |<----margin---->|
+ *
+ * This technique is more useful for decompression,
+ * since decompressed size is typically larger,
+ * and margin is short.
+ *
+ * In-place decompression will work inside any buffer
+ * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize).
+ * This presumes that decompressedSize > compressedSize.
+ * Otherwise, it means compression actually expanded data,
+ * and it would be more efficient to store such data with a flag indicating it's not compressed.
+ * This can happen when data is not compressible (already compressed, or encrypted).
+ *
+ * For in-place compression, margin is larger, as it must be able to cope with both
+ * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX,
+ * and data expansion, which can happen when input is not compressible.
+ * As a consequence, buffer size requirements are much higher,
+ * and memory savings offered by in-place compression are more limited.
+ *
+ * There are ways to limit this cost for compression :
+ * - Reduce history size, by modifying LZ4_DISTANCE_MAX.
+ *   Note that it is a compile-time constant, so all compressions will apply this limit.
+ *   Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX,
+ *   so it's a reasonable trick when inputs are known to be small.
+ * - Require the compressor to deliver a "maximum compressed size".
+ *   This is the `dstCapacity` parameter in `LZ4_compress*()`.
+ *   When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail,
+ *   in which case, the return code will be 0 (zero).
+ *   The caller must be ready for these cases to happen,
+ *   and typically design a backup scheme to send data uncompressed.
+ * The combination of both techniques can significantly reduce
+ * the amount of margin required for in-place compression.
+ *
+ * In-place compression can work in any buffer
+ * which size is >= (maxCompressedSize)
+ * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success.
+ * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX,
+ * so it's possible to reduce memory requirements by playing with them.
+ */
+
+#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize)          (((compressedSize) >> 8) + 32)
+#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize)   ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize))  /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */
+
+#ifndef LZ4_DISTANCE_MAX   /* history window size; can be user-defined at compile time */
+#  define LZ4_DISTANCE_MAX 65535   /* set to maximum value by default */
+#endif
+
+#define LZ4_COMPRESS_INPLACE_MARGIN                           (LZ4_DISTANCE_MAX + 32)   /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */
+#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize)   ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN)  /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */
+
+#endif   /* LZ4_STATIC_3504398509 */
+#endif   /* LZ4_STATIC_LINKING_ONLY */
+
+
+
+#ifndef LZ4_H_98237428734687
+#define LZ4_H_98237428734687
+
+/*-************************************************************
+ *  Private Definitions
+ **************************************************************
+ * Do not use these definitions directly.
+ * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`.
+ * Accessing members will expose user code to API and/or ABI break in future versions of the library.
+ **************************************************************/
+#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
+#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
+#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG)       /* required as macro for static allocation */
+
+#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# include <stdint.h>
+  typedef  int8_t  LZ4_i8;
+  typedef uint8_t  LZ4_byte;
+  typedef uint16_t LZ4_u16;
+  typedef uint32_t LZ4_u32;
+#else
+  typedef   signed char  LZ4_i8;
+  typedef unsigned char  LZ4_byte;
+  typedef unsigned short LZ4_u16;
+  typedef unsigned int   LZ4_u32;
+#endif
+
+/*! LZ4_stream_t :
+ *  Never ever use below internal definitions directly !
+ *  These definitions are not API/ABI safe, and may change in future versions.
+ *  If you need static allocation, declare or allocate an LZ4_stream_t object.
+**/
+
+typedef struct LZ4_stream_t_internal LZ4_stream_t_internal;
+struct LZ4_stream_t_internal {
+    LZ4_u32 hashTable[LZ4_HASH_SIZE_U32];
+    const LZ4_byte* dictionary;
+    const LZ4_stream_t_internal* dictCtx;
+    LZ4_u32 currentOffset;
+    LZ4_u32 tableType;
+    LZ4_u32 dictSize;
+    /* Implicit padding to ensure structure is aligned */
+};
+
+#define LZ4_STREAM_MINSIZE  ((1UL << (LZ4_MEMORY_USAGE)) + 32)  /* static size, for inter-version compatibility */
+union LZ4_stream_u {
+    char minStateSize[LZ4_STREAM_MINSIZE];
+    LZ4_stream_t_internal internal_donotuse;
+}; /* previously typedef'd to LZ4_stream_t */
+
+
+/*! LZ4_initStream() : v1.9.0+
+ *  An LZ4_stream_t structure must be initialized at least once.
+ *  This is automatically done when invoking LZ4_createStream(),
+ *  but it's not when the structure is simply declared on stack (for example).
+ *
+ *  Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t.
+ *  It can also initialize any arbitrary buffer of sufficient size,
+ *  and will @return a pointer of proper type upon initialization.
+ *
+ *  Note : initialization fails if size and alignment conditions are not respected.
+ *         In which case, the function will @return NULL.
+ *  Note2: An LZ4_stream_t structure guarantees correct alignment and size.
+ *  Note3: Before v1.9.0, use LZ4_resetStream() instead
+**/
+LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* stateBuffer, size_t size);
+
+
+/*! LZ4_streamDecode_t :
+ *  Never ever use below internal definitions directly !
+ *  These definitions are not API/ABI safe, and may change in future versions.
+ *  If you need static allocation, declare or allocate an LZ4_streamDecode_t object.
+**/
+typedef struct {
+    const LZ4_byte* externalDict;
+    const LZ4_byte* prefixEnd;
+    size_t extDictSize;
+    size_t prefixSize;
+} LZ4_streamDecode_t_internal;
+
+#define LZ4_STREAMDECODE_MINSIZE 32
+union LZ4_streamDecode_u {
+    char minStateSize[LZ4_STREAMDECODE_MINSIZE];
+    LZ4_streamDecode_t_internal internal_donotuse;
+} ;   /* previously typedef'd to LZ4_streamDecode_t */
+
+
+
+/*-************************************
+*  Obsolete Functions
+**************************************/
+
+/*! Deprecation warnings
+ *
+ *  Deprecated functions make the compiler generate a warning when invoked.
+ *  This is meant to invite users to update their source code.
+ *  Should deprecation warnings be a problem, it is generally possible to disable them,
+ *  typically with -Wno-deprecated-declarations for gcc
+ *  or _CRT_SECURE_NO_WARNINGS in Visual.
+ *
+ *  Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS
+ *  before including the header file.
+ */
+#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS
+#  define LZ4_DEPRECATED(message)   /* disable deprecation warnings */
+#else
+#  if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
+#    define LZ4_DEPRECATED(message) [[deprecated(message)]]
+#  elif defined(_MSC_VER)
+#    define LZ4_DEPRECATED(message) __declspec(deprecated(message))
+#  elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45))
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31)
+#    define LZ4_DEPRECATED(message) __attribute__((deprecated))
+#  else
+#    pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler")
+#    define LZ4_DEPRECATED(message)   /* disabled */
+#  endif
+#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */
+
+/*! Obsolete compression functions (since v1.7.3) */
+LZ4_DEPRECATED("use LZ4_compress_default() instead")       LZ4LIB_API int LZ4_compress               (const char* src, char* dest, int srcSize);
+LZ4_DEPRECATED("use LZ4_compress_default() instead")       LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize);
+LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
+LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue                (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
+LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue  (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*! Obsolete decompression functions (since v1.8.0) */
+LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize);
+LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize);
+
+/* Obsolete streaming functions (since v1.7.0)
+ * degraded functionality; do not use!
+ *
+ * In order to perform streaming compression, these functions depended on data
+ * that is no longer tracked in the state. They have been preserved as well as
+ * possible: using them will still produce a correct output. However, they don't
+ * actually retain any history between compression calls. The compression ratio
+ * achieved will therefore be no better than compressing each chunk
+ * independently.
+ */
+LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer);
+LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int   LZ4_sizeofStreamState(void);
+LZ4_DEPRECATED("Use LZ4_resetStream() instead")  LZ4LIB_API int   LZ4_resetStreamState(void* state, char* inputBuffer);
+LZ4_DEPRECATED("Use LZ4_saveDict() instead")     LZ4LIB_API char* LZ4_slideInputBuffer (void* state);
+
+/*! Obsolete streaming decoding functions (since v1.7.0) */
+LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
+LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
+
+/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) :
+ *  These functions used to be faster than LZ4_decompress_safe(),
+ *  but this is no longer the case. They are now slower.
+ *  This is because LZ4_decompress_fast() doesn't know the input size,
+ *  and therefore must progress more cautiously into the input buffer to not read beyond the end of block.
+ *  On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability.
+ *  As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated.
+ *
+ *  The last remaining LZ4_decompress_fast() specificity is that
+ *  it can decompress a block without knowing its compressed size.
+ *  Such functionality can be achieved in a more secure manner
+ *  by employing LZ4_decompress_safe_partial().
+ *
+ *  Parameters:
+ *  originalSize : is the uncompressed size to regenerate.
+ *                 `dst` must be already allocated, its size must be >= 'originalSize' bytes.
+ * @return : number of bytes read from source buffer (== compressed size).
+ *           The function expects to finish at block's end exactly.
+ *           If the source stream is detected malformed, the function stops decoding and returns a negative result.
+ *  note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer.
+ *         However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds.
+ *         Also, since match offsets are not validated, match reads from 'src' may underflow too.
+ *         These issues never happen if input (compressed) data is correct.
+ *         But they may happen if input data is invalid (error or intentional tampering).
+ *         As a consequence, use these functions in trusted environments with trusted data **only**.
+ */
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial() instead")
+LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating towards LZ4_decompress_safe_continue() instead. "
+               "Note that the contract will change (requires block's compressed size, instead of decompressed size)")
+LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_partial_usingDict() instead")
+LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize);
+
+/*! LZ4_resetStream() :
+ *  An LZ4_stream_t structure must be initialized at least once.
+ *  This is done with LZ4_initStream(), or LZ4_resetStream().
+ *  Consider switching to LZ4_initStream(),
+ *  invoking LZ4_resetStream() will trigger deprecation warnings in the future.
+ */
+LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr);
+
+
+#endif /* LZ4_H_98237428734687 */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/vendor/compress/lz4/src/lz4frame.h b/vendor/compress/lz4/src/lz4frame.h
new file mode 100644
index 000000000..b8ae32276
--- /dev/null
+++ b/vendor/compress/lz4/src/lz4frame.h
@@ -0,0 +1,751 @@
+/*
+   LZ4F - LZ4-Frame library
+   Header File
+   Copyright (C) 2011-2020, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/lz4/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* LZ4F is a stand-alone API able to create and decode LZ4 frames
+ * conformant with specification v1.6.1 in doc/lz4_Frame_format.md .
+ * Generated frames are compatible with `lz4` CLI.
+ *
+ * LZ4F also offers streaming capabilities.
+ *
+ * lz4.h is not required when using lz4frame.h,
+ * except to extract common constants such as LZ4_VERSION_NUMBER.
+ * */
+
+#ifndef LZ4F_H_09782039843
+#define LZ4F_H_09782039843
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---   Dependency   --- */
+#include <stddef.h>   /* size_t */
+
+
+/**
+ * Introduction
+ *
+ * lz4frame.h implements LZ4 frame specification: see doc/lz4_Frame_format.md .
+ * LZ4 Frames are compatible with `lz4` CLI,
+ * and designed to be interoperable with any system.
+**/
+
+/*-***************************************************************
+ *  Compiler specifics
+ *****************************************************************/
+/*  LZ4_DLL_EXPORT :
+ *  Enable exporting of functions when building a Windows DLL
+ *  LZ4FLIB_VISIBILITY :
+ *  Control library symbols visibility.
+ */
+#ifndef LZ4FLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define LZ4FLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define LZ4FLIB_VISIBILITY
+#  endif
+#endif
+#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
+#  define LZ4FLIB_API __declspec(dllexport) LZ4FLIB_VISIBILITY
+#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
+#  define LZ4FLIB_API __declspec(dllimport) LZ4FLIB_VISIBILITY
+#else
+#  define LZ4FLIB_API LZ4FLIB_VISIBILITY
+#endif
+
+#ifdef LZ4F_DISABLE_DEPRECATE_WARNINGS
+#  define LZ4F_DEPRECATE(x) x
+#else
+#  if defined(_MSC_VER)
+#    define LZ4F_DEPRECATE(x) x   /* __declspec(deprecated) x - only works with C++ */
+#  elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6))
+#    define LZ4F_DEPRECATE(x) x __attribute__((deprecated))
+#  else
+#    define LZ4F_DEPRECATE(x) x   /* no deprecation warning for this compiler */
+#  endif
+#endif
+
+
+/*-************************************
+ *  Error management
+ **************************************/
+typedef size_t LZ4F_errorCode_t;
+
+LZ4FLIB_API unsigned    LZ4F_isError(LZ4F_errorCode_t code);   /**< tells when a function result is an error code */
+LZ4FLIB_API const char* LZ4F_getErrorName(LZ4F_errorCode_t code);   /**< return error code string; for debugging */
+
+
+/*-************************************
+ *  Frame compression types
+ ************************************* */
+/* #define LZ4F_ENABLE_OBSOLETE_ENUMS   // uncomment to enable obsolete enums */
+#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
+#  define LZ4F_OBSOLETE_ENUM(x) , LZ4F_DEPRECATE(x) = LZ4F_##x
+#else
+#  define LZ4F_OBSOLETE_ENUM(x)
+#endif
+
+/* The larger the block size, the (slightly) better the compression ratio,
+ * though there are diminishing returns.
+ * Larger blocks also increase memory usage on both compression and decompression sides.
+ */
+typedef enum {
+    LZ4F_default=0,
+    LZ4F_max64KB=4,
+    LZ4F_max256KB=5,
+    LZ4F_max1MB=6,
+    LZ4F_max4MB=7
+    LZ4F_OBSOLETE_ENUM(max64KB)
+    LZ4F_OBSOLETE_ENUM(max256KB)
+    LZ4F_OBSOLETE_ENUM(max1MB)
+    LZ4F_OBSOLETE_ENUM(max4MB)
+} LZ4F_blockSizeID_t;
+
+/* Linked blocks sharply reduce inefficiencies when using small blocks,
+ * they compress better.
+ * However, some LZ4 decoders are only compatible with independent blocks */
+typedef enum {
+    LZ4F_blockLinked=0,
+    LZ4F_blockIndependent
+    LZ4F_OBSOLETE_ENUM(blockLinked)
+    LZ4F_OBSOLETE_ENUM(blockIndependent)
+} LZ4F_blockMode_t;
+
+typedef enum {
+    LZ4F_noContentChecksum=0,
+    LZ4F_contentChecksumEnabled
+    LZ4F_OBSOLETE_ENUM(noContentChecksum)
+    LZ4F_OBSOLETE_ENUM(contentChecksumEnabled)
+} LZ4F_contentChecksum_t;
+
+typedef enum {
+    LZ4F_noBlockChecksum=0,
+    LZ4F_blockChecksumEnabled
+} LZ4F_blockChecksum_t;
+
+typedef enum {
+    LZ4F_frame=0,
+    LZ4F_skippableFrame
+    LZ4F_OBSOLETE_ENUM(skippableFrame)
+} LZ4F_frameType_t;
+
+#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
+typedef LZ4F_blockSizeID_t blockSizeID_t;
+typedef LZ4F_blockMode_t blockMode_t;
+typedef LZ4F_frameType_t frameType_t;
+typedef LZ4F_contentChecksum_t contentChecksum_t;
+#endif
+
+/*! LZ4F_frameInfo_t :
+ *  makes it possible to set or read frame parameters.
+ *  Structure must be first init to 0, using memset() or LZ4F_INIT_FRAMEINFO,
+ *  setting all parameters to default.
+ *  It's then possible to update selectively some parameters */
+typedef struct {
+  LZ4F_blockSizeID_t     blockSizeID;         /* max64KB, max256KB, max1MB, max4MB; 0 == default (LZ4F_max64KB) */
+  LZ4F_blockMode_t       blockMode;           /* LZ4F_blockLinked, LZ4F_blockIndependent; 0 == default (LZ4F_blockLinked) */
+  LZ4F_contentChecksum_t contentChecksumFlag; /* 1: add a 32-bit checksum of frame's decompressed data; 0 == default (disabled) */
+  LZ4F_frameType_t       frameType;           /* read-only field : LZ4F_frame or LZ4F_skippableFrame */
+  unsigned long long     contentSize;         /* Size of uncompressed content ; 0 == unknown */
+  unsigned               dictID;              /* Dictionary ID, sent by compressor to help decoder select correct dictionary; 0 == no dictID provided */
+  LZ4F_blockChecksum_t   blockChecksumFlag;   /* 1: each block followed by a checksum of block's compressed data; 0 == default (disabled) */
+} LZ4F_frameInfo_t;
+
+#define LZ4F_INIT_FRAMEINFO   { LZ4F_max64KB, LZ4F_blockLinked, LZ4F_noContentChecksum, LZ4F_frame, 0ULL, 0U, LZ4F_noBlockChecksum }    /* v1.8.3+ */
+
+/*! LZ4F_preferences_t :
+ *  makes it possible to supply advanced compression instructions to streaming interface.
+ *  Structure must be first init to 0, using memset() or LZ4F_INIT_PREFERENCES,
+ *  setting all parameters to default.
+ *  All reserved fields must be set to zero. */
+typedef struct {
+  LZ4F_frameInfo_t frameInfo;
+  int      compressionLevel;    /* 0: default (fast mode); values > LZ4HC_CLEVEL_MAX count as LZ4HC_CLEVEL_MAX; values < 0 trigger "fast acceleration" */
+  unsigned autoFlush;           /* 1: always flush; reduces usage of internal buffers */
+  unsigned favorDecSpeed;       /* 1: parser favors decompression speed vs compression ratio. Only works for high compression modes (>= LZ4HC_CLEVEL_OPT_MIN) */  /* v1.8.2+ */
+  unsigned reserved[3];         /* must be zero for forward compatibility */
+} LZ4F_preferences_t;
+
+#define LZ4F_INIT_PREFERENCES   { LZ4F_INIT_FRAMEINFO, 0, 0u, 0u, { 0u, 0u, 0u } }    /* v1.8.3+ */
+
+
+/*-*********************************
+*  Simple compression function
+***********************************/
+
+/*! LZ4F_compressFrame() :
+ *  Compress srcBuffer content into an LZ4-compressed frame.
+ *  It's a one shot operation, all input content is consumed, and all output is generated.
+ *
+ *  Note : it's a stateless operation (no LZ4F_cctx state needed).
+ *  In order to reduce load on the allocator, LZ4F_compressFrame(), by default,
+ *  uses the stack to allocate space for the compression state and some table.
+ *  If this usage of the stack is too much for your application,
+ *  consider compiling `lz4frame.c` with compile-time macro LZ4F_HEAPMODE set to 1 instead.
+ *  All state allocations will use the Heap.
+ *  It also means each invocation of LZ4F_compressFrame() will trigger several internal alloc/free invocations.
+ *
+ * @dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+ * @preferencesPtr is optional : one can provide NULL, in which case all preferences are set to default.
+ * @return : number of bytes written into dstBuffer.
+ *           or an error code if it fails (can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
+                                const void* srcBuffer, size_t srcSize,
+                                const LZ4F_preferences_t* preferencesPtr);
+
+/*! LZ4F_compressFrameBound() :
+ *  Returns the maximum possible compressed size with LZ4F_compressFrame() given srcSize and preferences.
+ * `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences.
+ *  Note : this result is only usable with LZ4F_compressFrame().
+ *         It may also be relevant to LZ4F_compressUpdate() _only if_ no flush() operation is ever performed.
+ */
+LZ4FLIB_API size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr);
+
+
+/*! LZ4F_compressionLevel_max() :
+ * @return maximum allowed compression level (currently: 12)
+ */
+LZ4FLIB_API int LZ4F_compressionLevel_max(void);   /* v1.8.0+ */
+
+
+/*-***********************************
+*  Advanced compression functions
+*************************************/
+typedef struct LZ4F_cctx_s LZ4F_cctx;   /* incomplete type */
+typedef LZ4F_cctx* LZ4F_compressionContext_t;  /* for compatibility with older APIs, prefer using LZ4F_cctx */
+
+typedef struct {
+  unsigned stableSrc;    /* 1 == src content will remain present on future calls to LZ4F_compress(); skip copying src content within tmp buffer */
+  unsigned reserved[3];
+} LZ4F_compressOptions_t;
+
+/*---   Resource Management   ---*/
+
+#define LZ4F_VERSION 100    /* This number can be used to check for an incompatible API breaking change */
+LZ4FLIB_API unsigned LZ4F_getVersion(void);
+
+/*! LZ4F_createCompressionContext() :
+ *  The first thing to do is to create a compressionContext object,
+ *  which will keep track of operation state during streaming compression.
+ *  This is achieved using LZ4F_createCompressionContext(), which takes as argument a version,
+ *  and a pointer to LZ4F_cctx*, to write the resulting pointer into.
+ *  @version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL.
+ *  The function provides a pointer to a fully allocated LZ4F_cctx object.
+ *  @cctxPtr MUST be != NULL.
+ *  If @return != zero, context creation failed.
+ *  A created compression context can be employed multiple times for consecutive streaming operations.
+ *  Once all streaming compression jobs are completed,
+ *  the state object can be released using LZ4F_freeCompressionContext().
+ *  Note1 : LZ4F_freeCompressionContext() is always successful. Its return value can be ignored.
+ *  Note2 : LZ4F_freeCompressionContext() works fine with NULL input pointers (do nothing).
+**/
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** cctxPtr, unsigned version);
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctx);
+
+
+/*----    Compression    ----*/
+
+#define LZ4F_HEADER_SIZE_MIN  7   /* LZ4 Frame header size can vary, depending on selected parameters */
+#define LZ4F_HEADER_SIZE_MAX 19
+
+/* Size in bytes of a block header in little-endian format. Highest bit indicates if block data is uncompressed */
+#define LZ4F_BLOCK_HEADER_SIZE 4
+
+/* Size in bytes of a block checksum footer in little-endian format. */
+#define LZ4F_BLOCK_CHECKSUM_SIZE 4
+
+/* Size in bytes of the content checksum. */
+#define LZ4F_CONTENT_CHECKSUM_SIZE 4
+
+/*! LZ4F_compressBegin() :
+ *  will write the frame header into dstBuffer.
+ *  dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+ * `prefsPtr` is optional : NULL can be provided to set all preferences to default.
+ * @return : number of bytes written into dstBuffer for the header
+ *           or an error code (which can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressBegin(LZ4F_cctx* cctx,
+                                      void* dstBuffer, size_t dstCapacity,
+                                      const LZ4F_preferences_t* prefsPtr);
+
+/*! LZ4F_compressBound() :
+ *  Provides minimum dstCapacity required to guarantee success of
+ *  LZ4F_compressUpdate(), given a srcSize and preferences, for a worst case scenario.
+ *  When srcSize==0, LZ4F_compressBound() provides an upper bound for LZ4F_flush() and LZ4F_compressEnd() instead.
+ *  Note that the result is only valid for a single invocation of LZ4F_compressUpdate().
+ *  When invoking LZ4F_compressUpdate() multiple times,
+ *  if the output buffer is gradually filled up instead of emptied and re-used from its start,
+ *  one must check if there is enough remaining capacity before each invocation, using LZ4F_compressBound().
+ * @return is always the same for a srcSize and prefsPtr.
+ *  prefsPtr is optional : when NULL is provided, preferences will be set to cover worst case scenario.
+ *  tech details :
+ * @return if automatic flushing is not enabled, includes the possibility that internal buffer might already be filled by up to (blockSize-1) bytes.
+ *  It also includes frame footer (ending + checksum), since it might be generated by LZ4F_compressEnd().
+ * @return doesn't include frame header, as it was already generated by LZ4F_compressBegin().
+ */
+LZ4FLIB_API size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* prefsPtr);
+
+/*! LZ4F_compressUpdate() :
+ *  LZ4F_compressUpdate() can be called repetitively to compress as much data as necessary.
+ *  Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations.
+ *  This value is provided by LZ4F_compressBound().
+ *  If this condition is not respected, LZ4F_compress() will fail (result is an errorCode).
+ *  After an error, the state is left in a UB state, and must be re-initialized or freed.
+ *  If previously an uncompressed block was written, buffered data is flushed
+ *  before appending compressed data is continued.
+ * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
+ * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressUpdate(LZ4F_cctx* cctx,
+                                       void* dstBuffer, size_t dstCapacity,
+                                 const void* srcBuffer, size_t srcSize,
+                                 const LZ4F_compressOptions_t* cOptPtr);
+
+/*! LZ4F_flush() :
+ *  When data must be generated and sent immediately, without waiting for a block to be completely filled,
+ *  it's possible to call LZ4_flush(). It will immediately compress any data buffered within cctx.
+ * `dstCapacity` must be large enough to ensure the operation will be successful.
+ * `cOptPtr` is optional : it's possible to provide NULL, all options will be set to default.
+ * @return : nb of bytes written into dstBuffer (can be zero, when there is no data stored within cctx)
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ *  Note : LZ4F_flush() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
+ */
+LZ4FLIB_API size_t LZ4F_flush(LZ4F_cctx* cctx,
+                              void* dstBuffer, size_t dstCapacity,
+                        const LZ4F_compressOptions_t* cOptPtr);
+
+/*! LZ4F_compressEnd() :
+ *  To properly finish an LZ4 frame, invoke LZ4F_compressEnd().
+ *  It will flush whatever data remained within `cctx` (like LZ4_flush())
+ *  and properly finalize the frame, with an endMark and a checksum.
+ * `cOptPtr` is optional : NULL can be provided, in which case all options will be set to default.
+ * @return : nb of bytes written into dstBuffer, necessarily >= 4 (endMark),
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ *  Note : LZ4F_compressEnd() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
+ *  A successful call to LZ4F_compressEnd() makes `cctx` available again for another compression task.
+ */
+LZ4FLIB_API size_t LZ4F_compressEnd(LZ4F_cctx* cctx,
+                                    void* dstBuffer, size_t dstCapacity,
+                              const LZ4F_compressOptions_t* cOptPtr);
+
+
+/*-*********************************
+*  Decompression functions
+***********************************/
+typedef struct LZ4F_dctx_s LZ4F_dctx;   /* incomplete type */
+typedef LZ4F_dctx* LZ4F_decompressionContext_t;   /* compatibility with previous API versions */
+
+typedef struct {
+  unsigned stableDst;     /* pledges that last 64KB decompressed data is present right before @dstBuffer pointer.
+                           * This optimization skips internal storage operations.
+                           * Once set, this pledge must remain valid up to the end of current frame. */
+  unsigned skipChecksums; /* disable checksum calculation and verification, even when one is present in frame, to save CPU time.
+                           * Setting this option to 1 once disables all checksums for the rest of the frame. */
+  unsigned reserved1;     /* must be set to zero for forward compatibility */
+  unsigned reserved0;     /* idem */
+} LZ4F_decompressOptions_t;
+
+
+/* Resource management */
+
+/*! LZ4F_createDecompressionContext() :
+ *  Create an LZ4F_dctx object, to track all decompression operations.
+ *  @version provided MUST be LZ4F_VERSION.
+ *  @dctxPtr MUST be valid.
+ *  The function fills @dctxPtr with the value of a pointer to an allocated and initialized LZ4F_dctx object.
+ *  The @return is an errorCode, which can be tested using LZ4F_isError().
+ *  dctx memory can be released using LZ4F_freeDecompressionContext();
+ *  Result of LZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released.
+ *  That is, it should be == 0 if decompression has been completed fully and correctly.
+ */
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** dctxPtr, unsigned version);
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);
+
+
+/*-***********************************
+*  Streaming decompression functions
+*************************************/
+
+#define LZ4F_MAGICNUMBER 0x184D2204U
+#define LZ4F_MAGIC_SKIPPABLE_START 0x184D2A50U
+#define LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5
+
+/*! LZ4F_headerSize() : v1.9.0+
+ *  Provide the header size of a frame starting at `src`.
+ * `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH,
+ *  which is enough to decode the header length.
+ * @return : size of frame header
+ *           or an error code, which can be tested using LZ4F_isError()
+ *  note : Frame header size is variable, but is guaranteed to be
+ *         >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes.
+ */
+LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize);
+
+/*! LZ4F_getFrameInfo() :
+ *  This function extracts frame parameters (max blockSize, dictID, etc.).
+ *  Its usage is optional: user can also invoke LZ4F_decompress() directly.
+ *
+ *  Extracted information will fill an existing LZ4F_frameInfo_t structure.
+ *  This can be useful for allocation and dictionary identification purposes.
+ *
+ *  LZ4F_getFrameInfo() can work in the following situations :
+ *
+ *  1) At the beginning of a new frame, before any invocation of LZ4F_decompress().
+ *     It will decode header from `srcBuffer`,
+ *     consuming the header and starting the decoding process.
+ *
+ *     Input size must be large enough to contain the full frame header.
+ *     Frame header size can be known beforehand by LZ4F_headerSize().
+ *     Frame header size is variable, but is guaranteed to be >= LZ4F_HEADER_SIZE_MIN bytes,
+ *     and not more than <= LZ4F_HEADER_SIZE_MAX bytes.
+ *     Hence, blindly providing LZ4F_HEADER_SIZE_MAX bytes or more will always work.
+ *     It's allowed to provide more input data than the header size,
+ *     LZ4F_getFrameInfo() will only consume the header.
+ *
+ *     If input size is not large enough,
+ *     aka if it's smaller than header size,
+ *     function will fail and return an error code.
+ *
+ *  2) After decoding has been started,
+ *     it's possible to invoke LZ4F_getFrameInfo() anytime
+ *     to extract already decoded frame parameters stored within dctx.
+ *
+ *     Note that, if decoding has barely started,
+ *     and not yet read enough information to decode the header,
+ *     LZ4F_getFrameInfo() will fail.
+ *
+ *  The number of bytes consumed from srcBuffer will be updated in *srcSizePtr (necessarily <= original value).
+ *  LZ4F_getFrameInfo() only consumes bytes when decoding has not yet started,
+ *  and when decoding the header has been successful.
+ *  Decompression must then resume from (srcBuffer + *srcSizePtr).
+ *
+ * @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call,
+ *           or an error code which can be tested using LZ4F_isError().
+ *  note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely.
+ *  note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure.
+ */
+LZ4FLIB_API size_t
+LZ4F_getFrameInfo(LZ4F_dctx* dctx,
+                  LZ4F_frameInfo_t* frameInfoPtr,
+            const void* srcBuffer, size_t* srcSizePtr);
+
+/*! LZ4F_decompress() :
+ *  Call this function repetitively to regenerate data compressed in `srcBuffer`.
+ *
+ *  The function requires a valid dctx state.
+ *  It will read up to *srcSizePtr bytes from srcBuffer,
+ *  and decompress data into dstBuffer, of capacity *dstSizePtr.
+ *
+ *  The nb of bytes consumed from srcBuffer will be written into *srcSizePtr (necessarily <= original value).
+ *  The nb of bytes decompressed into dstBuffer will be written into *dstSizePtr (necessarily <= original value).
+ *
+ *  The function does not necessarily read all input bytes, so always check value in *srcSizePtr.
+ *  Unconsumed source data must be presented again in subsequent invocations.
+ *
+ * `dstBuffer` can freely change between each consecutive function invocation.
+ * `dstBuffer` content will be overwritten.
+ *
+ *  Note: if `LZ4F_getFrameInfo()` is called before `LZ4F_decompress()`, srcBuffer must be updated to reflect
+ *  the number of bytes consumed after reading the frame header. Failure to update srcBuffer before calling
+ *  `LZ4F_decompress()` will cause decompression failure or, even worse, successful but incorrect decompression.
+ *  See the `LZ4F_getFrameInfo()` docs for details.
+ *
+ * @return : an hint of how many `srcSize` bytes LZ4F_decompress() expects for next call.
+ *  Schematically, it's the size of the current (or remaining) compressed block + header of next block.
+ *  Respecting the hint provides some small speed benefit, because it skips intermediate buffers.
+ *  This is just a hint though, it's always possible to provide any srcSize.
+ *
+ *  When a frame is fully decoded, @return will be 0 (no more data expected).
+ *  When provided with more bytes than necessary to decode a frame,
+ *  LZ4F_decompress() will stop reading exactly at end of current frame, and @return 0.
+ *
+ *  If decompression failed, @return is an error code, which can be tested using LZ4F_isError().
+ *  After a decompression error, the `dctx` context is not resumable.
+ *  Use LZ4F_resetDecompressionContext() to return to clean state.
+ *
+ *  After a frame is fully decoded, dctx can be used again to decompress another frame.
+ */
+LZ4FLIB_API size_t
+LZ4F_decompress(LZ4F_dctx* dctx,
+                void* dstBuffer, size_t* dstSizePtr,
+          const void* srcBuffer, size_t* srcSizePtr,
+          const LZ4F_decompressOptions_t* dOptPtr);
+
+
+/*! LZ4F_resetDecompressionContext() : added in v1.8.0
+ *  In case of an error, the context is left in "undefined" state.
+ *  In which case, it's necessary to reset it, before re-using it.
+ *  This method can also be used to abruptly stop any unfinished decompression,
+ *  and start a new one using same context resources. */
+LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx);   /* always successful */
+
+
+/**********************************
+ *  Dictionary compression API
+ *********************************/
+
+/* A Dictionary is useful for the compression of small messages (KB range).
+ * It dramatically improves compression efficiency.
+ *
+ * LZ4 can ingest any input as dictionary, though only the last 64 KB are useful.
+ * Better results are generally achieved by using Zstandard's Dictionary Builder
+ * to generate a high-quality dictionary from a set of samples.
+ *
+ * The same dictionary will have to be used on the decompression side
+ * for decoding to be successful.
+ * To help identify the correct dictionary at decoding stage,
+ * the frame header allows optional embedding of a dictID field.
+ */
+
+/*! LZ4F_compressBegin_usingDict() : stable since v1.10
+ *  Inits dictionary compression streaming, and writes the frame header into dstBuffer.
+ * @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+ * @prefsPtr is optional : one may provide NULL as argument,
+ *  however, it's the only way to provide dictID in the frame header.
+ * @dictBuffer must outlive the compression session.
+ * @return : number of bytes written into dstBuffer for the header,
+ *           or an error code (which can be tested using LZ4F_isError())
+ *  NOTE: The LZ4Frame spec allows each independent block to be compressed with the dictionary,
+ *        but this entry supports a more limited scenario, where only the first block uses the dictionary.
+ *        This is still useful for small data, which only need one block anyway.
+ *        For larger inputs, one may be more interested in LZ4F_compressFrame_usingCDict() below.
+ */
+LZ4FLIB_API size_t
+LZ4F_compressBegin_usingDict(LZ4F_cctx* cctx,
+                            void* dstBuffer, size_t dstCapacity,
+                      const void* dictBuffer, size_t dictSize,
+                      const LZ4F_preferences_t* prefsPtr);
+
+/*! LZ4F_decompress_usingDict() : stable since v1.10
+ *  Same as LZ4F_decompress(), using a predefined dictionary.
+ *  Dictionary is used "in place", without any preprocessing.
+**  It must remain accessible throughout the entire frame decoding. */
+LZ4FLIB_API size_t
+LZ4F_decompress_usingDict(LZ4F_dctx* dctxPtr,
+                          void* dstBuffer, size_t* dstSizePtr,
+                    const void* srcBuffer, size_t* srcSizePtr,
+                    const void* dict, size_t dictSize,
+                    const LZ4F_decompressOptions_t* decompressOptionsPtr);
+
+/*****************************************
+ *  Bulk processing dictionary compression
+ *****************************************/
+
+/* Loading a dictionary has a cost, since it involves construction of tables.
+ * The Bulk processing dictionary API makes it possible to share this cost
+ * over an arbitrary number of compression jobs, even concurrently,
+ * markedly improving compression latency for these cases.
+ *
+ * Note that there is no corresponding bulk API for the decompression side,
+ * because dictionary does not carry any initialization cost for decompression.
+ * Use the regular LZ4F_decompress_usingDict() there.
+ */
+typedef struct LZ4F_CDict_s LZ4F_CDict;
+
+/*! LZ4_createCDict() : stable since v1.10
+ *  When compressing multiple messages / blocks using the same dictionary, it's recommended to initialize it just once.
+ *  LZ4_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+ *  LZ4_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after LZ4_CDict creation, since its content is copied within CDict. */
+LZ4FLIB_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
+LZ4FLIB_API void        LZ4F_freeCDict(LZ4F_CDict* CDict);
+
+/*! LZ4_compressFrame_usingCDict() : stable since v1.10
+ *  Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary.
+ * @cctx must point to a context created by LZ4F_createCompressionContext().
+ *  If @cdict==NULL, compress without a dictionary.
+ * @dstBuffer MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+ *  If this condition is not respected, function will fail (@return an errorCode).
+ *  The LZ4F_preferences_t structure is optional : one may provide NULL as argument,
+ *  but it's not recommended, as it's the only way to provide @dictID in the frame header.
+ * @return : number of bytes written into dstBuffer.
+ *           or an error code if it fails (can be tested using LZ4F_isError())
+ *  Note: for larger inputs generating multiple independent blocks,
+ *        this entry point uses the dictionary for each block. */
+LZ4FLIB_API size_t
+LZ4F_compressFrame_usingCDict(LZ4F_cctx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                        const LZ4F_CDict* cdict,
+                        const LZ4F_preferences_t* preferencesPtr);
+
+/*! LZ4F_compressBegin_usingCDict() : stable since v1.10
+ *  Inits streaming dictionary compression, and writes the frame header into dstBuffer.
+ * @dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+ * @prefsPtr is optional : one may provide NULL as argument,
+ *  note however that it's the only way to insert a @dictID in the frame header.
+ * @cdict must outlive the compression session.
+ * @return : number of bytes written into dstBuffer for the header,
+ *           or an error code, which can be tested using LZ4F_isError(). */
+LZ4FLIB_API size_t
+LZ4F_compressBegin_usingCDict(LZ4F_cctx* cctx,
+                              void* dstBuffer, size_t dstCapacity,
+                        const LZ4F_CDict* cdict,
+                        const LZ4F_preferences_t* prefsPtr);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* LZ4F_H_09782039843 */
+
+#if defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843)
+#define LZ4F_H_STATIC_09782039843
+
+/* Note :
+ * The below declarations are not stable and may change in the future.
+ * They are therefore only safe to depend on
+ * when the caller is statically linked against the library.
+ * To access their declarations, define LZ4F_STATIC_LINKING_ONLY.
+ *
+ * By default, these symbols aren't published into shared/dynamic libraries.
+ * You can override this behavior and force them to be published
+ * by defining LZ4F_PUBLISH_STATIC_FUNCTIONS.
+ * Use at your own risk.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifdef LZ4F_PUBLISH_STATIC_FUNCTIONS
+# define LZ4FLIB_STATIC_API LZ4FLIB_API
+#else
+# define LZ4FLIB_STATIC_API
+#endif
+
+
+/* ---   Error List   --- */
+#define LZ4F_LIST_ERRORS(ITEM) \
+        ITEM(OK_NoError) \
+        ITEM(ERROR_GENERIC) \
+        ITEM(ERROR_maxBlockSize_invalid) \
+        ITEM(ERROR_blockMode_invalid) \
+        ITEM(ERROR_parameter_invalid) \
+        ITEM(ERROR_compressionLevel_invalid) \
+        ITEM(ERROR_headerVersion_wrong) \
+        ITEM(ERROR_blockChecksum_invalid) \
+        ITEM(ERROR_reservedFlag_set) \
+        ITEM(ERROR_allocation_failed) \
+        ITEM(ERROR_srcSize_tooLarge) \
+        ITEM(ERROR_dstMaxSize_tooSmall) \
+        ITEM(ERROR_frameHeader_incomplete) \
+        ITEM(ERROR_frameType_unknown) \
+        ITEM(ERROR_frameSize_wrong) \
+        ITEM(ERROR_srcPtr_wrong) \
+        ITEM(ERROR_decompressionFailed) \
+        ITEM(ERROR_headerChecksum_invalid) \
+        ITEM(ERROR_contentChecksum_invalid) \
+        ITEM(ERROR_frameDecoding_alreadyStarted) \
+        ITEM(ERROR_compressionState_uninitialized) \
+        ITEM(ERROR_parameter_null) \
+        ITEM(ERROR_io_write) \
+        ITEM(ERROR_io_read) \
+        ITEM(ERROR_maxCode)
+
+#define LZ4F_GENERATE_ENUM(ENUM) LZ4F_##ENUM,
+
+/* enum list is exposed, to handle specific errors */
+typedef enum { LZ4F_LIST_ERRORS(LZ4F_GENERATE_ENUM)
+              _LZ4F_dummy_error_enum_for_c89_never_used } LZ4F_errorCodes;
+
+LZ4FLIB_STATIC_API LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult);
+
+/**********************************
+ *  Advanced compression operations
+ *********************************/
+
+/*! LZ4F_getBlockSize() :
+ * @return, in scalar format (size_t),
+ *          the maximum block size associated with @blockSizeID,
+ *          or an error code (can be tested using LZ4F_isError()) if @blockSizeID is invalid.
+**/
+LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(LZ4F_blockSizeID_t blockSizeID);
+
+/*! LZ4F_uncompressedUpdate() :
+ *  LZ4F_uncompressedUpdate() can be called repetitively to add data stored as uncompressed blocks.
+ *  Important rule: dstCapacity MUST be large enough to store the entire source buffer as
+ *  no compression is done for this operation
+ *  If this condition is not respected, LZ4F_uncompressedUpdate() will fail (result is an errorCode).
+ *  After an error, the state is left in a UB state, and must be re-initialized or freed.
+ *  If previously a compressed block was written, buffered data is flushed first,
+ *  before appending uncompressed data is continued.
+ *  This operation is only supported when LZ4F_blockIndependent is used.
+ * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
+ * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ */
+LZ4FLIB_STATIC_API size_t
+LZ4F_uncompressedUpdate(LZ4F_cctx* cctx,
+                        void* dstBuffer, size_t dstCapacity,
+                  const void* srcBuffer, size_t srcSize,
+                  const LZ4F_compressOptions_t* cOptPtr);
+
+/**********************************
+ *  Custom memory allocation
+ *********************************/
+
+/*! Custom memory allocation : v1.9.4+
+ *  These prototypes make it possible to pass custom allocation/free functions.
+ *  LZ4F_customMem is provided at state creation time, using LZ4F_create*_advanced() listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*LZ4F_AllocFunction) (void* opaqueState, size_t size);
+typedef void* (*LZ4F_CallocFunction) (void* opaqueState, size_t size);
+typedef void  (*LZ4F_FreeFunction) (void* opaqueState, void* address);
+typedef struct {
+    LZ4F_AllocFunction customAlloc;
+    LZ4F_CallocFunction customCalloc; /* optional; when not defined, uses customAlloc + memset */
+    LZ4F_FreeFunction customFree;
+    void* opaqueState;
+} LZ4F_CustomMem;
+static
+#ifdef __GNUC__
+__attribute__((__unused__))
+#endif
+LZ4F_CustomMem const LZ4F_defaultCMem = { NULL, NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+LZ4FLIB_STATIC_API LZ4F_cctx* LZ4F_createCompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version);
+LZ4FLIB_STATIC_API LZ4F_dctx* LZ4F_createDecompressionContext_advanced(LZ4F_CustomMem customMem, unsigned version);
+LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict_advanced(LZ4F_CustomMem customMem, const void* dictBuffer, size_t dictSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) */
diff --git a/vendor/compress/lz4/src/lz4hc.h b/vendor/compress/lz4/src/lz4hc.h
new file mode 100644
index 000000000..992bc8cdd
--- /dev/null
+++ b/vendor/compress/lz4/src/lz4hc.h
@@ -0,0 +1,414 @@
+/*
+   LZ4 HC - High Compression Mode of LZ4
+   Header File
+   Copyright (C) 2011-2020, Yann Collet.
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/lz4/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#ifndef LZ4_HC_H_19834876238432
+#define LZ4_HC_H_19834876238432
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* --- Dependency --- */
+/* note : lz4hc requires lz4.h/lz4.c for compilation */
+#include "lz4.h"   /* stddef, LZ4LIB_API, LZ4_DEPRECATED */
+
+
+/* --- Useful constants --- */
+#define LZ4HC_CLEVEL_MIN         2
+#define LZ4HC_CLEVEL_DEFAULT     9
+#define LZ4HC_CLEVEL_OPT_MIN    10
+#define LZ4HC_CLEVEL_MAX        12
+
+
+/*-************************************
+ *  Block Compression
+ **************************************/
+/*! LZ4_compress_HC() :
+ *  Compress data from `src` into `dst`, using the powerful but slower "HC" algorithm.
+ * `dst` must be already allocated.
+ *  Compression is guaranteed to succeed if `dstCapacity >= LZ4_compressBound(srcSize)` (see "lz4.h")
+ *  Max supported `srcSize` value is LZ4_MAX_INPUT_SIZE (see "lz4.h")
+ * `compressionLevel` : any value between 1 and LZ4HC_CLEVEL_MAX will work.
+ *                      Values > LZ4HC_CLEVEL_MAX behave the same as LZ4HC_CLEVEL_MAX.
+ * @return : the number of bytes written into 'dst'
+ *           or 0 if compression fails.
+ */
+LZ4LIB_API int LZ4_compress_HC (const char* src, char* dst, int srcSize, int dstCapacity, int compressionLevel);
+
+
+/* Note :
+ *   Decompression functions are provided within "lz4.h" (BSD license)
+ */
+
+
+/*! LZ4_compress_HC_extStateHC() :
+ *  Same as LZ4_compress_HC(), but using an externally allocated memory segment for `state`.
+ * `state` size is provided by LZ4_sizeofStateHC().
+ *  Memory segment must be aligned on 8-bytes boundaries (which a normal malloc() should do properly).
+ */
+LZ4LIB_API int LZ4_sizeofStateHC(void);
+LZ4LIB_API int LZ4_compress_HC_extStateHC(void* stateHC, const char* src, char* dst, int srcSize, int maxDstSize, int compressionLevel);
+
+
+/*! LZ4_compress_HC_destSize() : v1.9.0+
+ *  Will compress as much data as possible from `src`
+ *  to fit into `targetDstSize` budget.
+ *  Result is provided in 2 parts :
+ * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize)
+ *           or 0 if compression fails.
+ * `srcSizePtr` : on success, *srcSizePtr is updated to indicate how much bytes were read from `src`
+ */
+LZ4LIB_API int LZ4_compress_HC_destSize(void* stateHC,
+                                  const char* src, char* dst,
+                                        int* srcSizePtr, int targetDstSize,
+                                        int compressionLevel);
+
+
+/*-************************************
+ *  Streaming Compression
+ *  Bufferless synchronous API
+ **************************************/
+ typedef union LZ4_streamHC_u LZ4_streamHC_t;   /* incomplete type (defined later) */
+
+/*! LZ4_createStreamHC() and LZ4_freeStreamHC() :
+ *  These functions create and release memory for LZ4 HC streaming state.
+ *  Newly created states are automatically initialized.
+ *  A same state can be used multiple times consecutively,
+ *  starting with LZ4_resetStreamHC_fast() to start a new stream of blocks.
+ */
+LZ4LIB_API LZ4_streamHC_t* LZ4_createStreamHC(void);
+LZ4LIB_API int             LZ4_freeStreamHC (LZ4_streamHC_t* streamHCPtr);
+
+/*
+  These functions compress data in successive blocks of any size,
+  using previous blocks as dictionary, to improve compression ratio.
+  One key assumption is that previous blocks (up to 64 KB) remain read-accessible while compressing next blocks.
+  There is an exception for ring buffers, which can be smaller than 64 KB.
+  Ring-buffer scenario is automatically detected and handled within LZ4_compress_HC_continue().
+
+  Before starting compression, state must be allocated and properly initialized.
+  LZ4_createStreamHC() does both, though compression level is set to LZ4HC_CLEVEL_DEFAULT.
+
+  Selecting the compression level can be done with LZ4_resetStreamHC_fast() (starts a new stream)
+  or LZ4_setCompressionLevel() (anytime, between blocks in the same stream) (experimental).
+  LZ4_resetStreamHC_fast() only works on states which have been properly initialized at least once,
+  which is automatically the case when state is created using LZ4_createStreamHC().
+
+  After reset, a first "fictional block" can be designated as initial dictionary,
+  using LZ4_loadDictHC() (Optional).
+  Note: In order for LZ4_loadDictHC() to create the correct data structure,
+  it is essential to set the compression level _before_ loading the dictionary.
+
+  Invoke LZ4_compress_HC_continue() to compress each successive block.
+  The number of blocks is unlimited.
+  Previous input blocks, including initial dictionary when present,
+  must remain accessible and unmodified during compression.
+
+  It's allowed to update compression level anytime between blocks,
+  using LZ4_setCompressionLevel() (experimental).
+
+ @dst buffer should be sized to handle worst case scenarios
+  (see LZ4_compressBound(), it ensures compression success).
+  In case of failure, the API does not guarantee recovery,
+  so the state _must_ be reset.
+  To ensure compression success
+  whenever @dst buffer size cannot be made >= LZ4_compressBound(),
+  consider using LZ4_compress_HC_continue_destSize().
+
+  Whenever previous input blocks can't be preserved unmodified in-place during compression of next blocks,
+  it's possible to copy the last blocks into a more stable memory space, using LZ4_saveDictHC().
+  Return value of LZ4_saveDictHC() is the size of dictionary effectively saved into 'safeBuffer' (<= 64 KB)
+
+  After completing a streaming compression,
+  it's possible to start a new stream of blocks, using the same LZ4_streamHC_t state,
+  just by resetting it, using LZ4_resetStreamHC_fast().
+*/
+
+LZ4LIB_API void LZ4_resetStreamHC_fast(LZ4_streamHC_t* streamHCPtr, int compressionLevel);   /* v1.9.0+ */
+LZ4LIB_API int  LZ4_loadDictHC (LZ4_streamHC_t* streamHCPtr, const char* dictionary, int dictSize);
+
+LZ4LIB_API int LZ4_compress_HC_continue (LZ4_streamHC_t* streamHCPtr,
+                                   const char* src, char* dst,
+                                         int srcSize, int maxDstSize);
+
+/*! LZ4_compress_HC_continue_destSize() : v1.9.0+
+ *  Similar to LZ4_compress_HC_continue(),
+ *  but will read as much data as possible from `src`
+ *  to fit into `targetDstSize` budget.
+ *  Result is provided into 2 parts :
+ * @return : the number of bytes written into 'dst' (necessarily <= targetDstSize)
+ *           or 0 if compression fails.
+ * `srcSizePtr` : on success, *srcSizePtr will be updated to indicate how much bytes were read from `src`.
+ *           Note that this function may not consume the entire input.
+ */
+LZ4LIB_API int LZ4_compress_HC_continue_destSize(LZ4_streamHC_t* LZ4_streamHCPtr,
+                                           const char* src, char* dst,
+                                                 int* srcSizePtr, int targetDstSize);
+
+LZ4LIB_API int LZ4_saveDictHC (LZ4_streamHC_t* streamHCPtr, char* safeBuffer, int maxDictSize);
+
+
+/*! LZ4_attach_HC_dictionary() : stable since v1.10.0
+ *  This API allows for the efficient re-use of a static dictionary many times.
+ *
+ *  Rather than re-loading the dictionary buffer into a working context before
+ *  each compression, or copying a pre-loaded dictionary's LZ4_streamHC_t into a
+ *  working LZ4_streamHC_t, this function introduces a no-copy setup mechanism,
+ *  in which the working stream references the dictionary stream in-place.
+ *
+ *  Several assumptions are made about the state of the dictionary stream.
+ *  Currently, only streams which have been prepared by LZ4_loadDictHC() should
+ *  be expected to work.
+ *
+ *  Alternatively, the provided dictionary stream pointer may be NULL, in which
+ *  case any existing dictionary stream is unset.
+ *
+ *  A dictionary should only be attached to a stream without any history (i.e.,
+ *  a stream that has just been reset).
+ *
+ *  The dictionary will remain attached to the working stream only for the
+ *  current stream session. Calls to LZ4_resetStreamHC(_fast) will remove the
+ *  dictionary context association from the working stream. The dictionary
+ *  stream (and source buffer) must remain in-place / accessible / unchanged
+ *  through the lifetime of the stream session.
+ */
+LZ4LIB_API void
+LZ4_attach_HC_dictionary(LZ4_streamHC_t* working_stream,
+                   const LZ4_streamHC_t* dictionary_stream);
+
+
+/*^**********************************************
+ * !!!!!!   STATIC LINKING ONLY   !!!!!!
+ ***********************************************/
+
+/*-******************************************************************
+ * PRIVATE DEFINITIONS :
+ * Do not use these definitions directly.
+ * They are merely exposed to allow static allocation of `LZ4_streamHC_t`.
+ * Declare an `LZ4_streamHC_t` directly, rather than any type below.
+ * Even then, only do so in the context of static linking, as definitions may change between versions.
+ ********************************************************************/
+
+#define LZ4HC_DICTIONARY_LOGSIZE 16
+#define LZ4HC_MAXD (1<<LZ4HC_DICTIONARY_LOGSIZE)
+#define LZ4HC_MAXD_MASK (LZ4HC_MAXD - 1)
+
+#define LZ4HC_HASH_LOG 15
+#define LZ4HC_HASHTABLESIZE (1 << LZ4HC_HASH_LOG)
+#define LZ4HC_HASH_MASK (LZ4HC_HASHTABLESIZE - 1)
+
+
+/* Never ever use these definitions directly !
+ * Declare or allocate an LZ4_streamHC_t instead.
+**/
+typedef struct LZ4HC_CCtx_internal LZ4HC_CCtx_internal;
+struct LZ4HC_CCtx_internal
+{
+    LZ4_u32 hashTable[LZ4HC_HASHTABLESIZE];
+    LZ4_u16 chainTable[LZ4HC_MAXD];
+    const LZ4_byte* end;     /* next block here to continue on current prefix */
+    const LZ4_byte* prefixStart;  /* Indexes relative to this position */
+    const LZ4_byte* dictStart; /* alternate reference for extDict */
+    LZ4_u32 dictLimit;       /* below that point, need extDict */
+    LZ4_u32 lowLimit;        /* below that point, no more history */
+    LZ4_u32 nextToUpdate;    /* index from which to continue dictionary update */
+    short   compressionLevel;
+    LZ4_i8  favorDecSpeed;   /* favor decompression speed if this flag set,
+                                otherwise, favor compression ratio */
+    LZ4_i8  dirty;           /* stream has to be fully reset if this flag is set */
+    const LZ4HC_CCtx_internal* dictCtx;
+};
+
+#define LZ4_STREAMHC_MINSIZE  262200  /* static size, for inter-version compatibility */
+union LZ4_streamHC_u {
+    char minStateSize[LZ4_STREAMHC_MINSIZE];
+    LZ4HC_CCtx_internal internal_donotuse;
+}; /* previously typedef'd to LZ4_streamHC_t */
+
+/* LZ4_streamHC_t :
+ * This structure allows static allocation of LZ4 HC streaming state.
+ * This can be used to allocate statically on stack, or as part of a larger structure.
+ *
+ * Such state **must** be initialized using LZ4_initStreamHC() before first use.
+ *
+ * Note that invoking LZ4_initStreamHC() is not required when
+ * the state was created using LZ4_createStreamHC() (which is recommended).
+ * Using the normal builder, a newly created state is automatically initialized.
+ *
+ * Static allocation shall only be used in combination with static linking.
+ */
+
+/* LZ4_initStreamHC() : v1.9.0+
+ * Required before first use of a statically allocated LZ4_streamHC_t.
+ * Before v1.9.0 : use LZ4_resetStreamHC() instead
+ */
+LZ4LIB_API LZ4_streamHC_t* LZ4_initStreamHC(void* buffer, size_t size);
+
+
+/*-************************************
+*  Deprecated Functions
+**************************************/
+/* see lz4.h LZ4_DISABLE_DEPRECATE_WARNINGS to turn off deprecation warnings */
+
+/* deprecated compression functions */
+LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC               (const char* source, char* dest, int inputSize);
+LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
+LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC2              (const char* source, char* dest, int inputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_compress_HC() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC_withStateHC               (void* state, const char* source, char* dest, int inputSize);
+LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput_withStateHC (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC2_withStateHC              (void* state, const char* source, char* dest, int inputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_compress_HC_extStateHC() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput_withStateHC(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC_continue               (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize);
+LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC_limitedOutput_continue (LZ4_streamHC_t* LZ4_streamHCPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/* Obsolete streaming functions; degraded functionality; do not use!
+ *
+ * In order to perform streaming compression, these functions depended on data
+ * that is no longer tracked in the state. They have been preserved as well as
+ * possible: using them will still produce a correct output. However, use of
+ * LZ4_slideInputBufferHC() will truncate the history of the stream, rather
+ * than preserve a window-sized chunk of history.
+ */
+#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION)
+LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API void* LZ4_createHC (const char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_freeStreamHC() instead") LZ4LIB_API   int   LZ4_freeHC (void* LZ4HC_Data);
+#endif
+LZ4_DEPRECATED("use LZ4_saveDictHC() instead") LZ4LIB_API     char* LZ4_slideInputBufferHC (void* LZ4HC_Data);
+LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_continue               (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_compress_HC_continue() instead") LZ4LIB_API int LZ4_compressHC2_limitedOutput_continue (void* LZ4HC_Data, const char* source, char* dest, int inputSize, int maxOutputSize, int compressionLevel);
+LZ4_DEPRECATED("use LZ4_createStreamHC() instead") LZ4LIB_API int   LZ4_sizeofStreamStateHC(void);
+LZ4_DEPRECATED("use LZ4_initStreamHC() instead") LZ4LIB_API  int   LZ4_resetStreamStateHC(void* state, char* inputBuffer);
+
+
+/* LZ4_resetStreamHC() is now replaced by LZ4_initStreamHC().
+ * The intention is to emphasize the difference with LZ4_resetStreamHC_fast(),
+ * which is now the recommended function to start a new stream of blocks,
+ * but cannot be used to initialize a memory segment containing arbitrary garbage data.
+ *
+ * It is recommended to switch to LZ4_initStreamHC().
+ * LZ4_resetStreamHC() will generate deprecation warnings in a future version.
+ */
+LZ4LIB_API void LZ4_resetStreamHC (LZ4_streamHC_t* streamHCPtr, int compressionLevel);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* LZ4_HC_H_19834876238432 */
+
+
+/*-**************************************************
+ * !!!!!     STATIC LINKING ONLY     !!!!!
+ * Following definitions are considered experimental.
+ * They should not be linked from DLL,
+ * as there is no guarantee of API stability yet.
+ * Prototypes will be promoted to "stable" status
+ * after successful usage in real-life scenarios.
+ ***************************************************/
+#ifdef LZ4_HC_STATIC_LINKING_ONLY   /* protection macro */
+#ifndef LZ4_HC_SLO_098092834
+#define LZ4_HC_SLO_098092834
+
+#define LZ4_STATIC_LINKING_ONLY   /* LZ4LIB_STATIC_API */
+#include "lz4.h"
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*! LZ4_setCompressionLevel() : v1.8.0+ (experimental)
+ *  It's possible to change compression level
+ *  between successive invocations of LZ4_compress_HC_continue*()
+ *  for dynamic adaptation.
+ */
+LZ4LIB_STATIC_API void LZ4_setCompressionLevel(
+    LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel);
+
+/*! LZ4_favorDecompressionSpeed() : v1.8.2+ (experimental)
+ *  Opt. Parser will favor decompression speed over compression ratio.
+ *  Only applicable to levels >= LZ4HC_CLEVEL_OPT_MIN.
+ */
+LZ4LIB_STATIC_API void LZ4_favorDecompressionSpeed(
+    LZ4_streamHC_t* LZ4_streamHCPtr, int favor);
+
+/*! LZ4_resetStreamHC_fast() : v1.9.0+
+ *  When an LZ4_streamHC_t is known to be in a internally coherent state,
+ *  it can often be prepared for a new compression with almost no work, only
+ *  sometimes falling back to the full, expensive reset that is always required
+ *  when the stream is in an indeterminate state (i.e., the reset performed by
+ *  LZ4_resetStreamHC()).
+ *
+ *  LZ4_streamHCs are guaranteed to be in a valid state when:
+ *  - returned from LZ4_createStreamHC()
+ *  - reset by LZ4_resetStreamHC()
+ *  - memset(stream, 0, sizeof(LZ4_streamHC_t))
+ *  - the stream was in a valid state and was reset by LZ4_resetStreamHC_fast()
+ *  - the stream was in a valid state and was then used in any compression call
+ *    that returned success
+ *  - the stream was in an indeterminate state and was used in a compression
+ *    call that fully reset the state (LZ4_compress_HC_extStateHC()) and that
+ *    returned success
+ *
+ *  Note:
+ *  A stream that was last used in a compression call that returned an error
+ *  may be passed to this function. However, it will be fully reset, which will
+ *  clear any existing history and settings from the context.
+ */
+LZ4LIB_STATIC_API void LZ4_resetStreamHC_fast(
+    LZ4_streamHC_t* LZ4_streamHCPtr, int compressionLevel);
+
+/*! LZ4_compress_HC_extStateHC_fastReset() :
+ *  A variant of LZ4_compress_HC_extStateHC().
+ *
+ *  Using this variant avoids an expensive initialization step. It is only safe
+ *  to call if the state buffer is known to be correctly initialized already
+ *  (see above comment on LZ4_resetStreamHC_fast() for a definition of
+ *  "correctly initialized"). From a high level, the difference is that this
+ *  function initializes the provided state with a call to
+ *  LZ4_resetStreamHC_fast() while LZ4_compress_HC_extStateHC() starts with a
+ *  call to LZ4_resetStreamHC().
+ */
+LZ4LIB_STATIC_API int LZ4_compress_HC_extStateHC_fastReset (
+    void* state,
+    const char* src, char* dst,
+    int srcSize, int dstCapacity,
+    int compressionLevel);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* LZ4_HC_SLO_098092834 */
+#endif   /* LZ4_HC_STATIC_LINKING_ONLY */
diff --git a/vendor/glfw/LICENSE.txt b/vendor/glfw/LICENSE.txt
new file mode 100644
index 000000000..b8c096845
--- /dev/null
+++ b/vendor/glfw/LICENSE.txt
@@ -0,0 +1,22 @@
+Copyright (c) 2002-2006 Marcus Geelnard
+
+Copyright (c) 2006-2019 Camilla Löwy
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would
+   be appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not
+   be misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+   distribution.
diff --git a/vendor/glfw/bindings/bindings.odin b/vendor/glfw/bindings/bindings.odin
index 164a8ea2d..81569f177 100644
--- a/vendor/glfw/bindings/bindings.odin
+++ b/vendor/glfw/bindings/bindings.odin
@@ -197,7 +197,12 @@ foreign glfw {
 
 	SetErrorCallback :: proc(cbfun: ErrorProc) -> ErrorProc ---
 
+	// Functions added in 3.4, Linux links against system glfw so we define these as weak to be able
+	// to check at runtime if they are available.
+
+	@(linkage="weak")
 	GetPlatform       :: proc() -> c.int ---
+	@(linkage="weak")
 	PlatformSupported :: proc(platform: c.int) -> b32 ---
 }
 
diff --git a/vendor/glfw/native_linux.odin b/vendor/glfw/native_linux.odin
index 6833d2893..acae8a27e 100644
--- a/vendor/glfw/native_linux.odin
+++ b/vendor/glfw/native_linux.odin
@@ -13,7 +13,13 @@ foreign {
 	SetX11SelectionString :: proc(string:  cstring) ---
 	GetX11SelectionString :: proc() -> cstring ---
 
+	// Functions added in 3.4, Linux links against system glfw so we define these as weak to be able
+	// to check at runtime if they are available.
+
+	@(linkage="weak")
 	GetWaylandDisplay :: proc()                       -> rawptr /* struct wl_display* */ ---
+	@(linkage="weak")
 	GetWaylandWindow  :: proc(window:  WindowHandle)  -> rawptr /* struct wl_surface* */ ---
+	@(linkage="weak")
 	GetWaylandMonitor :: proc(monitor: MonitorHandle) -> rawptr /* struct wl_output*  */ ---
 }
diff --git a/vendor/lua/5.1/lua.odin b/vendor/lua/5.1/lua.odin
index b53c61bb3..5b7482931 100644
--- a/vendor/lua/5.1/lua.odin
+++ b/vendor/lua/5.1/lua.odin
@@ -16,7 +16,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua5.1.so"
 	} else {
-		foreign import lib "system:liblua.so.5.1"
+		foreign import lib "system:lua5.1"
 	}
 } else {
 	when ODIN_OS == .Windows {
@@ -24,7 +24,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua5.1.a"
 	} else {
-		foreign import lib "system:liblua5.1.a"
+		foreign import lib "system:lua5.1"
 	}
 }
 
diff --git a/vendor/lua/5.2/lua.odin b/vendor/lua/5.2/lua.odin
index 5474da95d..d5d8ec253 100644
--- a/vendor/lua/5.2/lua.odin
+++ b/vendor/lua/5.2/lua.odin
@@ -16,7 +16,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua52.so"
 	} else {
-		foreign import lib "system:liblua.so.5.2"
+		foreign import lib "system:lua5.2"
 	}
 } else {
 	when ODIN_OS == .Windows {
@@ -24,7 +24,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua52.a"
 	} else {
-		foreign import lib "system:liblua52.a"
+		foreign import lib "system:lua5.2"
 	}
 }
 
diff --git a/vendor/lua/5.3/lua.odin b/vendor/lua/5.3/lua.odin
index e0975e5f8..47215a327 100644
--- a/vendor/lua/5.3/lua.odin
+++ b/vendor/lua/5.3/lua.odin
@@ -16,7 +16,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua53.so"
 	} else {
-		foreign import lib "system:liblua.so.5.3"
+		foreign import lib "system:lua5.3"
 	}
 } else {
 	when ODIN_OS == .Windows {
@@ -24,7 +24,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua53.a"
 	} else {
-		foreign import lib "system:liblua53.a"
+		foreign import lib "system:lua5.3"
 	}
 }
 
diff --git a/vendor/lua/5.4/lua.odin b/vendor/lua/5.4/lua.odin
index 80f7ead3a..9be8fea55 100644
--- a/vendor/lua/5.4/lua.odin
+++ b/vendor/lua/5.4/lua.odin
@@ -16,15 +16,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua54.so"
 	} else {
-		// Note(bumbread): My linux system has a few aliases for this shared object
-		//   lublua5.4.so, liblua.so, lublua.so.5.4, liblua.so.5.4.6. I don't know
-		// who enforces these numbers (probably ld?), and if it can be done in a
-		// unix-generic way, but in any way I think the most sane thing to do is to
-		// keep it close to what linux does and if it breaks, just special case those
-		// operating systems.
-		// Also there was no alias for liblua54.so, that seems to suggest that way
-		// of specifying it isn't portable
-		foreign import lib "system:liblua.so.5.4"
+		foreign import lib "system:lua5.4"
 	}
 } else {
 	when ODIN_OS == .Windows {
@@ -32,7 +24,7 @@ when LUA_SHARED {
 	} else when ODIN_OS == .Linux {
 		foreign import lib "linux/liblua54.a"
 	} else {
-		foreign import lib "system:liblua54.a"
+		foreign import lib "system:lua5.4"
 	}
 }
 
diff --git a/vendor/lua/README.md b/vendor/lua/README.md
index 8f4b0f5a5..4bc7804bb 100644
--- a/vendor/lua/README.md
+++ b/vendor/lua/README.md
@@ -1,12 +1,50 @@
 # Lua in Odin
 
-```odin
-import lua "vendor:lua/5.4" // or whatever version you want
-```
-
 Lua packages
 
 * `vendor:lua/5.1` (version 5.1.5)
 * `vendor:lua/5.2` (version 5.2.4)
 * `vendor:lua/5.3` (version 5.3.6)
-* `vendor:lua/5.4` (version 5.4.2)
\ No newline at end of file
+* `vendor:lua/5.4` (version 5.4.2)
+
+With custom context-based allocator:
+
+```odin
+package lua_example_with_context
+
+import "core:fmt"
+import lua "vendor:lua/5.4" // or whatever version you want
+import "core:c"
+import "base:runtime"
+
+state: ^lua.State
+
+lua_allocator :: proc "c" (ud: rawptr, ptr: rawptr, osize, nsize: c.size_t) -> (buf: rawptr) {
+	old_size := int(osize)
+	new_size := int(nsize)
+	context = (^runtime.Context)(ud)^
+
+	if ptr == nil {
+		data, err := runtime.mem_alloc(new_size)
+		return raw_data(data) if err == .None else nil
+	} else {
+		if nsize > 0 {
+			data, err := runtime.mem_resize(ptr, old_size, new_size)
+			return raw_data(data) if err == .None else nil
+		} else {
+			runtime.mem_free(ptr)
+			return
+		}
+	}
+}
+
+main :: proc() {
+	_context := context
+	state = lua.newstate(lua_allocator, &_context)
+	defer lua.close(state)
+
+	lua.L_dostring(state, "return 'somestring'")
+	str := lua.tostring(state, -1)
+	fmt.println(str)
+}
+```
\ No newline at end of file
diff --git a/vendor/raylib/raylib.odin b/vendor/raylib/raylib.odin
index 3d1b74058..cff590b7f 100644
--- a/vendor/raylib/raylib.odin
+++ b/vendor/raylib/raylib.odin
@@ -1015,8 +1015,8 @@ foreign lib {
 
 	SetRandomSeed  		 :: proc(seed: c.uint) ---                      // Set the seed for the random number generator
 	GetRandomValue 		 :: proc(min, max: c.int) -> c.int ---          // Get a random value between min and max (both included)
-	LoadRandomSequence 	 :: proc(count : c.uint, min, max: c.int) --- 	// Load random values sequence, no values repeated
-	UnloadRandomSequence :: proc(sequence : ^c.int) ---             	// Unload random values sequence
+	LoadRandomSequence 	 :: proc(count: c.uint, min, max: c.int) --- 	// Load random values sequence, no values repeated
+	UnloadRandomSequence     :: proc(sequence: ^c.int) ---                  // Unload random values sequence
 
 	// Misc. functions
 	TakeScreenshot :: proc(fileName: cstring) ---        // Takes a screenshot of current screen (filename extension defines format)
@@ -1424,9 +1424,9 @@ foreign lib {
 
 	LoadUTF8             :: proc(codepoints: [^]rune, length: c.int) -> [^]byte --- // Load UTF-8 text encoded from codepoints array
 	UnloadUTF8           :: proc(text: [^]byte) ---                                 // Unload UTF-8 text encoded from codepoints array
-	LoadCodepoints       :: proc(text: rawptr, count: ^c.int) -> [^]rune ---        // Load all codepoints from a UTF-8 text string, codepoints count returned by parameter
+	LoadCodepoints       :: proc(text: cstring, count: ^c.int) -> [^]rune ---       // Load all codepoints from a UTF-8 text string, codepoints count returned by parameter
 	UnloadCodepoints     :: proc(codepoints: [^]rune) ---                           // Unload codepoints data from memory
-	GetCodepointCount    :: proc(text : cstring) -> c.int ---                       // Get total number of codepoints in a UTF-8 encoded string
+	GetCodepointCount    :: proc(text: cstring) -> c.int ---                        // Get total number of codepoints in a UTF-8 encoded string
 	GetCodepoint         :: proc(text: cstring, codepointSize: ^c.int) -> rune ---  // Get next codepoint in a UTF-8 encoded string, 0x3f('?') is returned on failure
 	GetCodepointNext     :: proc(text: cstring, codepointSize: ^c.int) -> rune ---  // Get next codepoint in a UTF-8 encoded string, 0x3f('?') is returned on failure
 	GetCodepointPrevious :: proc(text: cstring, codepointSize: ^c.int) -> rune ---  // Get previous codepoint in a UTF-8 encoded string, 0x3f('?') is returned on failure
@@ -1667,7 +1667,7 @@ IsGestureDetected :: proc "c" (gesture: Gesture) -> bool {
 
 
 // Text formatting with variables (sprintf style)
-TextFormat :: proc(text: cstring, args: ..any) -> cstring { 
+TextFormat :: proc(text: cstring, args: ..any) -> cstring {
 	@static buffers: [MAX_TEXTFORMAT_BUFFERS][MAX_TEXT_BUFFER_LENGTH]byte
 	@static index: u32
 	
@@ -1683,7 +1683,7 @@ TextFormat :: proc(text: cstring, args: ..any) -> cstring {
 }
 
 // Text formatting with variables (sprintf style) and allocates (must be freed with 'MemFree')
-TextFormatAlloc :: proc(text: cstring, args: ..any) -> cstring { 
+TextFormatAlloc :: proc(text: cstring, args: ..any) -> cstring {
 	str := fmt.tprintf(string(text), ..args)
 	return strings.clone_to_cstring(str, MemAllocator())
 }
diff --git a/vendor/stb/truetype/stb_truetype.odin b/vendor/stb/truetype/stb_truetype.odin
index 6993cd2b7..e6defff5f 100644
--- a/vendor/stb/truetype/stb_truetype.odin
+++ b/vendor/stb/truetype/stb_truetype.odin
@@ -568,7 +568,7 @@ foreign stbtt {
 	// some of the values for the IDs are below; for more see the truetype spec:
 	//     http://developer.apple.com/textfonts/TTRefMan/RM06/Chap6name.html
 	//     http://www.microsoft.com/typography/otspec/name.htm
-	GetFontNameString :: proc(font: ^fontinfo, length: c.int, platformID: PLATFORM_ID, encodingID, languageID, nameID: c.int) -> cstring ---
+	GetFontNameString :: proc(font: ^fontinfo, length: ^c.int, platformID: PLATFORM_ID, encodingID, languageID, nameID: c.int) -> cstring ---
 }
 
 
diff --git a/vendor/wgpu/README.md b/vendor/wgpu/README.md
index 3561642f4..59b31567f 100644
--- a/vendor/wgpu/README.md
+++ b/vendor/wgpu/README.md
@@ -41,8 +41,14 @@ It exports one procedure `GetSurface(wgpu.Instance, glfw.WindowHandle) -> glfw.S
 The procedure will call the needed target specific procedures and return a surface configured
 for the given window.
 
-To support Wayland on Linux, you need to have GLFW compiled to support it, and use
-`-define:WGPU_GFLW_GLUE_SUPPORT_WAYLAND=true` to enable the package to check for Wayland.
-
 Do note that wgpu does not require GLFW, you can use native windows or another windowing library too.
 For that you can take inspiration from `glfwglue` on glueing them together.
+
+### Wayland
+
+GLFW supports Wayland from version 3.4 onwards and only if it is compiled with `-DGLFW_EXPOSE_NATIVE_WAYLAND`.
+
+Odin links against your system's glfw library (probably installed through a package manager).
+If that version is lower than 3.4 or hasn't been compiled with the previously mentioned define,
+you will have to compile glfw from source yourself and adjust the `foreign import` declarations in `vendor:glfw/bindings` to
+point to it.
diff --git a/vendor/wgpu/glfwglue/glue_linux.odin b/vendor/wgpu/glfwglue/glue_linux.odin
index 35c36a37d..45d29a638 100644
--- a/vendor/wgpu/glfwglue/glue_linux.odin
+++ b/vendor/wgpu/glfwglue/glue_linux.odin
@@ -3,11 +3,8 @@ package wgpu_glfw_glue
 import "vendor:glfw"
 import "vendor:wgpu"
 
-// GLFW needs to be compiled with wayland support for this to work.
-SUPPORT_WAYLAND :: #config(WGPU_GFLW_GLUE_SUPPORT_WAYLAND, false)
-
 GetSurface :: proc(instance: wgpu.Instance, window: glfw.WindowHandle) -> wgpu.Surface {
-	when SUPPORT_WAYLAND {
+	if glfw.GetPlatform != nil {
 		if glfw.GetPlatform() == glfw.PLATFORM_WAYLAND {
 			display := glfw.GetWaylandDisplay()
 			surface := glfw.GetWaylandWindow(window)
@@ -24,6 +21,10 @@ GetSurface :: proc(instance: wgpu.Instance, window: glfw.WindowHandle) -> wgpu.S
 				},
 			)
 		}
+
+		if glfw.GetPlatform() != glfw.PLATFORM_X11 {
+			panic("wgpu glfw glue: unsupported platform, expected Wayland or X11")
+		}
 	}
 
 	display := glfw.GetX11Display()
diff --git a/vendor/wgpu/wgpu.odin b/vendor/wgpu/wgpu.odin
index 74df83fde..4efe572cf 100644
--- a/vendor/wgpu/wgpu.odin
+++ b/vendor/wgpu/wgpu.odin
@@ -1676,7 +1676,7 @@ when ODIN_OS != .JS {
 
 		GetVersion :: proc() -> u32 ---
 
-		RenderPassEncoderSetPushConstants :: proc(encoder: RenderPassEncoder, stages: ShaderStageFlags, offset: u32, sizeBytes: u32, data: cstring) ---
+		RenderPassEncoderSetPushConstants :: proc(encoder: RenderPassEncoder, stages: ShaderStageFlags, offset: u32, sizeBytes: u32, data: rawptr) ---
 
 		RenderPassEncoderMultiDrawIndirect :: proc(encoder: RenderPassEncoder, buffer: Buffer, offset: u64, count: u32) ---
 		RenderPassEncoderMultiDrawIndexedIndirect :: proc(encoder: RenderPassEncoder, buffer: Buffer, offset: u64, count: u32) ---
diff --git a/vendor/wgpu/wgpu_js.odin b/vendor/wgpu/wgpu_js.odin
index f375a0d69..3c8375adb 100644
--- a/vendor/wgpu/wgpu_js.odin
+++ b/vendor/wgpu/wgpu_js.odin
@@ -22,5 +22,6 @@ wgpu_alloc :: proc "contextless" (size: i32) -> [^]byte {
 @(private="file", export)
 wgpu_free :: proc "contextless" (ptr: rawptr) {
 	context = g_context
-	assert(free(ptr) == nil, "wgpu_free failed")
+	err := free(ptr)
+	assert(err == nil, "wgpu_free failed")
 }
diff --git a/vendor/x11/xlib/xlib_procs.odin b/vendor/x11/xlib/xlib_procs.odin
index 17d172172..207b3f6bc 100644
--- a/vendor/x11/xlib/xlib_procs.odin
+++ b/vendor/x11/xlib/xlib_procs.odin
@@ -49,7 +49,7 @@ foreign xlib {
 	DisplayString     :: proc(display: ^Display) -> cstring ---
 	// Display macros (defaults)
 	DefaultColormap   :: proc(display: ^Display, screen_no: i32) -> Colormap ---
-	DefaultDepth      :: proc(display: ^Display) -> i32 ---
+	DefaultDepth      :: proc(display: ^Display, screen_no: i32) -> i32 ---
 	DefaultGC         :: proc(display: ^Display, screen_no: i32) -> GC ---
 	DefaultRootWindow :: proc(display: ^Display) -> Window ---
 	DefaultScreen     :: proc(display: ^Display) -> i32 ---
@@ -138,8 +138,8 @@ foreign xlib {
 		width:     u32,
 		height:    u32,
 		bordersz:  u32,
-		border:    int,
-		bg:        int,
+		border:    uint,
+		bg:        uint,
 		) -> Window ---
 	DestroyWindow     :: proc(display: ^Display, window: Window) ---
 	DestroySubwindows :: proc(display: ^Display, window: Window) ---