diff --git a/core/crypto/_aes/aes.odin b/core/crypto/_aes/aes.odin index 4f52485d2..f458a12fb 100644 --- a/core/crypto/_aes/aes.odin +++ b/core/crypto/_aes/aes.odin @@ -25,4 +25,5 @@ GHASH_BLOCK_SIZE :: 16 GHASH_TAG_SIZE :: 16 // RCON is the AES keyschedule round constants. +@(rodata) RCON := [10]byte{0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36} diff --git a/core/crypto/_aes/ct64/ct64.odin b/core/crypto/_aes/ct64/ct64.odin index f198cab81..af2b42c1e 100644 --- a/core/crypto/_aes/ct64/ct64.odin +++ b/core/crypto/_aes/ct64/ct64.odin @@ -22,8 +22,6 @@ package aes_ct64 -import "base:intrinsics" - // Bitsliced AES for 64-bit general purpose (integer) registers. Each // invocation will process up to 4 blocks at a time. This implementation // is derived from the BearSSL ct64 code, and distributed under a 1-clause @@ -212,11 +210,8 @@ orthogonalize :: proc "contextless" (q: ^[8]u64) { } @(require_results) -interleave_in :: proc "contextless" (w: []u32) -> (q0, q1: u64) #no_bounds_check { - if len(w) < 4 { - intrinsics.trap() - } - x0, x1, x2, x3 := u64(w[0]), u64(w[1]), u64(w[2]), u64(w[3]) +interleave_in :: proc "contextless" (w0, w1, w2, w3: u32) -> (q0, q1: u64) #no_bounds_check { + x0, x1, x2, x3 := u64(w0), u64(w1), u64(w2), u64(w3) x0 |= (x0 << 16) x1 |= (x1 << 16) x2 |= (x2 << 16) diff --git a/core/crypto/_aes/ct64/ct64_enc.odin b/core/crypto/_aes/ct64/ct64_enc.odin index 36d4aebc8..bee6de722 100644 --- a/core/crypto/_aes/ct64/ct64_enc.odin +++ b/core/crypto/_aes/ct64/ct64_enc.odin @@ -22,12 +22,8 @@ package aes_ct64 -import "base:intrinsics" - add_round_key :: proc "contextless" (q: ^[8]u64, sk: []u64) #no_bounds_check { - if len(sk) < 8 { - intrinsics.trap() - } + ensure_contextless(len(sk) >= 8, "aes/ct64: invalid round key size") q[0] ~= sk[0] q[1] ~= sk[1] diff --git a/core/crypto/_aes/ct64/ct64_keysched.odin b/core/crypto/_aes/ct64/ct64_keysched.odin index 060a2c03e..0f00bba57 100644 --- a/core/crypto/_aes/ct64/ct64_keysched.odin +++ 
b/core/crypto/_aes/ct64/ct64_keysched.odin @@ -22,7 +22,6 @@ package aes_ct64 -import "base:intrinsics" import "core:crypto/_aes" import "core:encoding/endian" import "core:mem" @@ -42,7 +41,7 @@ sub_word :: proc "contextless" (x: u32) -> u32 { } @(private, require_results) -keysched :: proc(comp_skey: []u64, key: []byte) -> int { +keysched :: proc "contextless" (comp_skey: []u64, key: []byte) -> int { num_rounds, key_len := 0, len(key) switch key_len { case _aes.KEY_SIZE_128: @@ -52,7 +51,7 @@ keysched :: proc(comp_skey: []u64, key: []byte) -> int { case _aes.KEY_SIZE_256: num_rounds = _aes.ROUNDS_256 case: - panic("crypto/aes: invalid AES key size") + panic_contextless("crypto/aes: invalid AES key size") } skey: [60]u32 = --- @@ -78,7 +77,7 @@ keysched :: proc(comp_skey: []u64, key: []byte) -> int { q: [8]u64 = --- for i, j := 0, 0; i < nkf; i, j = i + 4, j + 2 { - q[0], q[4] = interleave_in(skey[i:]) + q[0], q[4] = interleave_in(skey[i], skey[i+1], skey[i+2], skey[i+3]) q[1] = q[0] q[2] = q[0] q[3] = q[0] @@ -123,57 +122,3 @@ skey_expand :: proc "contextless" (skey, comp_skey: []u64, num_rounds: int) { skey[v + 3] = (x3 << 4) - x3 } } - -orthogonalize_roundkey :: proc "contextless" (qq: []u64, key: []byte) { - if len(qq) < 8 || len(key) != 16 { - intrinsics.trap() - } - - skey: [4]u32 = --- - skey[0] = endian.unchecked_get_u32le(key[0:]) - skey[1] = endian.unchecked_get_u32le(key[4:]) - skey[2] = endian.unchecked_get_u32le(key[8:]) - skey[3] = endian.unchecked_get_u32le(key[12:]) - - q: [8]u64 = --- - q[0], q[4] = interleave_in(skey[:]) - q[1] = q[0] - q[2] = q[0] - q[3] = q[0] - q[5] = q[4] - q[6] = q[4] - q[7] = q[4] - orthogonalize(&q) - - comp_skey: [2]u64 = --- - comp_skey[0] = - (q[0] & 0x1111111111111111) | - (q[1] & 0x2222222222222222) | - (q[2] & 0x4444444444444444) | - (q[3] & 0x8888888888888888) - comp_skey[1] = - (q[4] & 0x1111111111111111) | - (q[5] & 0x2222222222222222) | - (q[6] & 0x4444444444444444) | - (q[7] & 0x8888888888888888) - - for x, u in 
comp_skey { - x0 := x - x1, x2, x3 := x0, x0, x0 - x0 &= 0x1111111111111111 - x1 &= 0x2222222222222222 - x2 &= 0x4444444444444444 - x3 &= 0x8888888888888888 - x1 >>= 1 - x2 >>= 2 - x3 >>= 3 - qq[u * 4 + 0] = (x0 << 4) - x0 - qq[u * 4 + 1] = (x1 << 4) - x1 - qq[u * 4 + 2] = (x2 << 4) - x2 - qq[u * 4 + 3] = (x3 << 4) - x3 - } - - mem.zero_explicit(&skey, size_of(skey)) - mem.zero_explicit(&q, size_of(q)) - mem.zero_explicit(&comp_skey, size_of(comp_skey)) -} diff --git a/core/crypto/_aes/ct64/ghash.odin b/core/crypto/_aes/ct64/ghash.odin index a522a481a..0c885d8ba 100644 --- a/core/crypto/_aes/ct64/ghash.odin +++ b/core/crypto/_aes/ct64/ghash.odin @@ -22,7 +22,6 @@ package aes_ct64 -import "base:intrinsics" import "core:crypto/_aes" import "core:encoding/endian" @@ -64,9 +63,8 @@ rev64 :: proc "contextless" (x: u64) -> u64 { // Note: `dst` is both an input and an output, to support easy implementation // of GCM. ghash :: proc "contextless" (dst, key, data: []byte) { - if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE { - intrinsics.trap() - } + ensure_contextless(len(dst) == _aes.GHASH_BLOCK_SIZE) + ensure_contextless(len(key) == _aes.GHASH_BLOCK_SIZE) buf := data l := len(buf) diff --git a/core/crypto/_aes/ct64/helpers.odin b/core/crypto/_aes/ct64/helpers.odin index 169271f6d..7eec5bdc4 100644 --- a/core/crypto/_aes/ct64/helpers.odin +++ b/core/crypto/_aes/ct64/helpers.odin @@ -1,60 +1,61 @@ package aes_ct64 -import "base:intrinsics" import "core:crypto/_aes" import "core:encoding/endian" -load_blockx1 :: proc "contextless" (q: ^[8]u64, src: []byte) { - if len(src) != _aes.BLOCK_SIZE { - intrinsics.trap() - } - - w: [4]u32 = --- - w[0] = endian.unchecked_get_u32le(src[0:]) - w[1] = endian.unchecked_get_u32le(src[4:]) - w[2] = endian.unchecked_get_u32le(src[8:]) - w[3] = endian.unchecked_get_u32le(src[12:]) - q[0], q[4] = interleave_in(w[:]) - orthogonalize(q) +@(require_results) +load_interleaved :: proc "contextless" (src: []byte) -> (u64, 
u64) #no_bounds_check { + w0 := endian.unchecked_get_u32le(src[0:]) + w1 := endian.unchecked_get_u32le(src[4:]) + w2 := endian.unchecked_get_u32le(src[8:]) + w3 := endian.unchecked_get_u32le(src[12:]) + return interleave_in(w0, w1, w2, w3) } -store_blockx1 :: proc "contextless" (dst: []byte, q: ^[8]u64) { - if len(dst) != _aes.BLOCK_SIZE { - intrinsics.trap() - } - - orthogonalize(q) - w0, w1, w2, w3 := interleave_out(q[0], q[4]) +store_interleaved :: proc "contextless" (dst: []byte, a0, a1: u64) #no_bounds_check { + w0, w1, w2, w3 := interleave_out(a0, a1) endian.unchecked_put_u32le(dst[0:], w0) endian.unchecked_put_u32le(dst[4:], w1) endian.unchecked_put_u32le(dst[8:], w2) endian.unchecked_put_u32le(dst[12:], w3) } +@(require_results) +xor_interleaved :: #force_inline proc "contextless" (a0, a1, b0, b1: u64) -> (u64, u64) { + return a0 ~ b0, a1 ~ b1 +} + +@(require_results) +and_interleaved :: #force_inline proc "contextless" (a0, a1, b0, b1: u64) -> (u64, u64) { + return a0 & b0, a1 & b1 +} + +load_blockx1 :: proc "contextless" (q: ^[8]u64, src: []byte) { + ensure_contextless(len(src) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size") + + q[0], q[4] = #force_inline load_interleaved(src) + orthogonalize(q) +} + +store_blockx1 :: proc "contextless" (dst: []byte, q: ^[8]u64) { + ensure_contextless(len(dst) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size") + + orthogonalize(q) + #force_inline store_interleaved(dst, q[0], q[4]) +} + load_blocks :: proc "contextless" (q: ^[8]u64, src: [][]byte) { - if n := len(src); n > STRIDE || n == 0 { - intrinsics.trap() - } + ensure_contextless(len(src) > 0 && len(src) <= STRIDE, "aes/ct64: invalid block(s) size") - w: [4]u32 = --- for s, i in src { - if len(s) != _aes.BLOCK_SIZE { - intrinsics.trap() - } - - w[0] = endian.unchecked_get_u32le(s[0:]) - w[1] = endian.unchecked_get_u32le(s[4:]) - w[2] = endian.unchecked_get_u32le(s[8:]) - w[3] = endian.unchecked_get_u32le(s[12:]) - q[i], q[i + 4] = interleave_in(w[:]) + 
ensure_contextless(len(s) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size") + q[i], q[i + 4] = #force_inline load_interleaved(s) } orthogonalize(q) } store_blocks :: proc "contextless" (dst: [][]byte, q: ^[8]u64) { - if n := len(dst); n > STRIDE || n == 0 { - intrinsics.trap() - } + ensure_contextless(len(dst) > 0 && len(dst) <= STRIDE, "aes/ct64: invalid block(s) size") orthogonalize(q) for d, i in dst { @@ -62,14 +63,7 @@ store_blocks :: proc "contextless" (dst: [][]byte, q: ^[8]u64) { if d == nil { break } - if len(d) != _aes.BLOCK_SIZE { - intrinsics.trap() - } - - w0, w1, w2, w3 := interleave_out(q[i], q[i + 4]) - endian.unchecked_put_u32le(d[0:], w0) - endian.unchecked_put_u32le(d[4:], w1) - endian.unchecked_put_u32le(d[8:], w2) - endian.unchecked_put_u32le(d[12:], w3) + ensure_contextless(len(d) == _aes.BLOCK_SIZE, "aes/ct64: invalid block size") + #force_inline store_interleaved(d, q[i], q[i + 4]) } } diff --git a/core/crypto/_aes/hw_intel/ghash.odin b/core/crypto/_aes/hw_intel/ghash.odin index 4320dd59b..5f51b614b 100644 --- a/core/crypto/_aes/hw_intel/ghash.odin +++ b/core/crypto/_aes/hw_intel/ghash.odin @@ -52,7 +52,7 @@ GHASH_STRIDE_BYTES_HW :: GHASH_STRIDE_HW * _aes.GHASH_BLOCK_SIZE // that it is right-shifted by 1 bit. The left-shift is relatively // inexpensive, and it can be mutualised. // -// Since SSE2 opcodes do not have facilities for shitfting full 128-bit +// Since SSE2 opcodes do not have facilities for shifting full 128-bit // values with bit precision, we have to break down values into 64-bit // chunks. We number chunks from 0 to 3 in left to right order. 
@@ -155,7 +155,7 @@ square_f128 :: #force_inline proc "contextless" (kw: x86.__m128i) -> (x86.__m128 @(enable_target_feature = "sse2,ssse3,pclmul") ghash :: proc "contextless" (dst, key, data: []byte) #no_bounds_check { if len(dst) != _aes.GHASH_BLOCK_SIZE || len(key) != _aes.GHASH_BLOCK_SIZE { - intrinsics.trap() + panic_contextless("aes/ghash: invalid dst or key size") } // Note: BearSSL opts to copy the remainder into a zero-filled diff --git a/core/crypto/_blake2/blake2.odin b/core/crypto/_blake2/blake2.odin index 2ad74843b..89fbe3a7a 100644 --- a/core/crypto/_blake2/blake2.odin +++ b/core/crypto/_blake2/blake2.odin @@ -18,6 +18,8 @@ BLAKE2S_SIZE :: 32 BLAKE2B_BLOCK_SIZE :: 128 BLAKE2B_SIZE :: 64 +MAX_SIZE :: 255 + Blake2s_Context :: struct { h: [8]u32, t: [2]u32, @@ -68,13 +70,13 @@ Blake2_Tree :: struct { is_last_node: bool, } -@(private) +@(private, rodata) BLAKE2S_IV := [8]u32 { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, } -@(private) +@(private, rodata) BLAKE2B_IV := [8]u64 { 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, @@ -82,16 +84,13 @@ BLAKE2B_IV := [8]u64 { 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, } -init :: proc(ctx: ^$T, cfg: ^Blake2_Config) { +init :: proc "contextless" (ctx: ^$T, cfg: ^Blake2_Config) { when T == Blake2s_Context { max_size :: BLAKE2S_SIZE } else when T == Blake2b_Context { max_size :: BLAKE2B_SIZE } - - if cfg.size > max_size { - panic("blake2: requested output size exceeeds algorithm max") - } + ensure_contextless(cfg.size <= max_size, "blake2: requested output size exceeeds algorithm max") // To save having to allocate a scratch buffer, use the internal // data buffer (`ctx.x`), as it is exactly the correct size. 
@@ -167,8 +166,8 @@ init :: proc(ctx: ^$T, cfg: ^Blake2_Config) { ctx.is_initialized = true } -update :: proc(ctx: ^$T, p: []byte) { - assert(ctx.is_initialized) +update :: proc "contextless" (ctx: ^$T, p: []byte) { + ensure_contextless(ctx.is_initialized) p := p when T == Blake2s_Context { @@ -195,8 +194,8 @@ update :: proc(ctx: ^$T, p: []byte) { ctx.nx += copy(ctx.x[ctx.nx:], p) } -final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) +final :: proc "contextless" (ctx: ^$T, hash: []byte, finalize_clone: bool = false) { + ensure_contextless(ctx.is_initialized) ctx := ctx if finalize_clone { @@ -206,24 +205,19 @@ final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { } defer(reset(ctx)) + ensure_contextless(len(hash) >= int(ctx.size), "crypto/blake2: invalid destination digest size") when T == Blake2s_Context { - if len(hash) < int(ctx.size) { - panic("crypto/blake2s: invalid destination digest size") - } blake2s_final(ctx, hash) } else when T == Blake2b_Context { - if len(hash) < int(ctx.size) { - panic("crypto/blake2b: invalid destination digest size") - } blake2b_final(ctx, hash) } } -clone :: proc(ctx, other: ^$T) { +clone :: proc "contextless" (ctx, other: ^$T) { ctx^ = other^ } -reset :: proc(ctx: ^$T) { +reset :: proc "contextless" (ctx: ^$T) { if !ctx.is_initialized { return } diff --git a/core/crypto/_chacha20/chacha20.odin b/core/crypto/_chacha20/chacha20.odin index a907209de..1a4b5a507 100644 --- a/core/crypto/_chacha20/chacha20.odin +++ b/core/crypto/_chacha20/chacha20.odin @@ -1,6 +1,5 @@ package _chacha20 -import "base:intrinsics" import "core:encoding/endian" import "core:math/bits" import "core:mem" @@ -46,9 +45,8 @@ Context :: struct { // derivation is expected to be handled by the caller, so that the // HChaCha call can be suitably accelerated. 
init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) { - if len(key) != KEY_SIZE || len(iv) != IV_SIZE { - intrinsics.trap() - } + ensure_contextless(len(key) == KEY_SIZE, "chacha20: invalid key size") + ensure_contextless(len(iv) == IV_SIZE, "chacha20: invalid IV size") k, n := key, iv @@ -76,12 +74,10 @@ init :: proc "contextless" (ctx: ^Context, key, iv: []byte, is_xchacha: bool) { // seek seeks the (X)ChaCha20 stream counter to the specified block. seek :: proc(ctx: ^Context, block_nr: u64) { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) if ctx._is_ietf_flavor { - if block_nr > MAX_CTR_IETF { - panic("crypto/chacha20: attempted to seek past maximum counter") - } + ensure(block_nr <= MAX_CTR_IETF, "crypto/chacha20: attempted to seek past maximum counter") } else { ctx._s[13] = u32(block_nr >> 32) } @@ -102,7 +98,7 @@ check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) { // Enforce the maximum consumed keystream per IV. // // While all modern "standard" definitions of ChaCha20 use - // the IETF 32-bit counter, for XChaCha20 most common + // the IETF 32-bit counter, for XChaCha20 historical // implementations allow for a 64-bit counter. 
// // Honestly, the answer here is "use a MRAE primitive", but @@ -110,14 +106,14 @@ check_counter_limit :: proc(ctx: ^Context, nr_blocks: int) { ERR_CTR_EXHAUSTED :: "crypto/chacha20: maximum (X)ChaCha20 keystream per IV reached" + ctr_ok: bool if ctx._is_ietf_flavor { - if u64(ctx._s[12]) + u64(nr_blocks) > MAX_CTR_IETF { - panic(ERR_CTR_EXHAUSTED) - } + ctr_ok = u64(ctx._s[12]) + u64(nr_blocks) <= MAX_CTR_IETF } else { ctr := (u64(ctx._s[13]) << 32) | u64(ctx._s[12]) - if _, carry := bits.add_u64(ctr, u64(nr_blocks), 0); carry != 0 { - panic(ERR_CTR_EXHAUSTED) - } + _, carry := bits.add_u64(ctr, u64(nr_blocks), 0) + ctr_ok = carry == 0 } + + ensure(ctr_ok, ERR_CTR_EXHAUSTED) } diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin index fe0d0d518..cf78541d1 100644 --- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin +++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin @@ -29,11 +29,24 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { // explicitly using simd.u8x16 shuffles. @(private = "file") TARGET_SIMD_FEATURES :: "sse2,ssse3" +} else when ODIN_ARCH == .riscv64 { + @(private = "file") + TARGET_SIMD_FEATURES :: "v" } else { @(private = "file") TARGET_SIMD_FEATURES :: "" } +// Some targets lack runtime feature detection, and will flat out refuse +// to load binaries that have unknown instructions. This is distinct from +// `simd.IS_EMULATED` as actually good designs support runtime feature +// detection and that constant establishes a baseline. 
+// +// See: +// - https://github.com/WebAssembly/design/issues/1161 +@(private = "file") +TARGET_IS_DESIGNED_BY_IDIOTS :: (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") + @(private = "file") _ROT_7L: simd.u32x4 : {7, 7, 7, 7} @(private = "file") @@ -205,11 +218,13 @@ _store_simd128 :: #force_inline proc "contextless" ( // is_performant returns true iff the target and current host both support // "enough" 128-bit SIMD to make this implementation performant. is_performant :: proc "contextless" () -> bool { - when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { + when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 || ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 || ODIN_ARCH == .riscv64 { when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 { req_features :: info.CPU_Features{.asimd} } else when ODIN_ARCH == .amd64 || ODIN_ARCH == .i386 { req_features :: info.CPU_Features{.sse2, .ssse3} + } else when ODIN_ARCH == .riscv64 { + req_features :: info.CPU_Features{.V} } features, ok := info.cpu_features.? @@ -245,8 +260,17 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) // 8 blocks at a time. // - // Note: This is only worth it on Aarch64. - when ODIN_ARCH == .arm64 { + // Note: + // This uses a ton of registers so it is only worth it on targets + // that have something like 32 128-bit registers. This is currently + // all ARMv8 targets, and RISC-V Zvl128b (`V` application profile) + // targets. + // + // While our current definition of `.arm32` is 32-bit ARMv8, this + // may change in the future (ARMv7 is still relevant), and things + // like Cortex-A8/A9 do "pretend" 128-bit SIMD 64-bits at a time + // and thus need benchmarking. 
+ when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 { for ; n >= 8; n = n - 8 { v0, v1, v2, v3 := s0, s1, s2, s3 @@ -354,9 +378,11 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) // 4 blocks at a time. // - // Note: The i386 target lacks the required number of registers - // for this to be performant, so it is skipped. - when ODIN_ARCH != .i386 { + // Note: This is skipped on several targets for various reasons. + // - i386 lacks the required number of registers + // - Generating code when runtime "hardware" SIMD support is impossible + // to detect is pointless, since this will be emulated using GP regs. + when ODIN_ARCH != .i386 && !TARGET_IS_DESIGNED_BY_IDIOTS { for ; n >= 4; n = n - 4 { v0, v1, v2, v3 := s0, s1, s2, s3 diff --git a/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin index ce673b42b..287ddd885 100644 --- a/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin +++ b/core/crypto/_chacha20/simd256/chacha20_simd256_stub.odin @@ -13,5 +13,5 @@ stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) } hchacha20 :: proc "contextless" (dst, key, iv: []byte) { - intrinsics.trap() + panic_contextless("crypto/chacha20: simd256 implementation unsupported") } \ No newline at end of file diff --git a/core/crypto/_edwards25519/edwards25519.odin b/core/crypto/_edwards25519/edwards25519.odin index 6495f7a3a..d6f01d497 100644 --- a/core/crypto/_edwards25519/edwards25519.odin +++ b/core/crypto/_edwards25519/edwards25519.odin @@ -11,7 +11,6 @@ See: - https://www.hyperelliptic.org/EFD/g1p/auto-twisted-extended-1.html */ -import "base:intrinsics" import "core:crypto" import field "core:crypto/_fiat/field_curve25519" import "core:mem" @@ -32,6 +31,7 @@ import "core:mem" // - The group element decoding routine takes the opinionated stance of // rejecting non-canonical encodings. 
+@(rodata) FE_D := field.Tight_Field_Element { 929955233495203, 466365720129213, @@ -39,7 +39,7 @@ FE_D := field.Tight_Field_Element { 2033849074728123, 1442794654840575, } -@(private) +@(private, rodata) FE_A := field.Tight_Field_Element { 2251799813685228, 2251799813685247, @@ -47,7 +47,7 @@ FE_A := field.Tight_Field_Element { 2251799813685247, 2251799813685247, } -@(private) +@(private, rodata) FE_D2 := field.Tight_Field_Element { 1859910466990425, 932731440258426, @@ -55,7 +55,7 @@ FE_D2 := field.Tight_Field_Element { 1815898335770999, 633789495995903, } -@(private) +@(private, rodata) GE_BASEPOINT := Group_Element { field.Tight_Field_Element { 1738742601995546, @@ -80,6 +80,7 @@ GE_BASEPOINT := Group_Element { 1821297809914039, }, } +@(rodata) GE_IDENTITY := Group_Element { field.Tight_Field_Element{0, 0, 0, 0, 0}, field.Tight_Field_Element{1, 0, 0, 0, 0}, @@ -107,9 +108,7 @@ ge_set :: proc "contextless" (ge, a: ^Group_Element) { @(require_results) ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool { - if len(b) != 32 { - intrinsics.trap() - } + ensure_contextless(len(b) == 32, "edwards25519: invalid group element size") b_ := (^[32]byte)(raw_data(b)) // Do the work in a scratch element, so that ge is unchanged on @@ -166,9 +165,7 @@ ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool { } ge_bytes :: proc "contextless" (ge: ^Group_Element, dst: []byte) { - if len(dst) != 32 { - intrinsics.trap() - } + ensure_contextless(len(dst) == 32, "edwards25519: invalid group element size") dst_ := (^[32]byte)(raw_data(dst)) // Convert the element to affine (x, y) representation. 
diff --git a/core/crypto/_edwards25519/edwards25519_scalar.odin b/core/crypto/_edwards25519/edwards25519_scalar.odin index e21fa3755..68c79a6e8 100644 --- a/core/crypto/_edwards25519/edwards25519_scalar.odin +++ b/core/crypto/_edwards25519/edwards25519_scalar.odin @@ -1,6 +1,5 @@ package _edwards25519 -import "base:intrinsics" import field "core:crypto/_fiat/field_scalar25519" import "core:mem" @@ -8,7 +7,7 @@ Scalar :: field.Montgomery_Domain_Field_Element // WARNING: This is non-canonical and only to be used when checking if // a group element is on the prime-order subgroup. -@(private) +@(private, rodata) SC_ELL := field.Non_Montgomery_Domain_Field_Element { field.ELL[0], field.ELL[1], @@ -25,17 +24,13 @@ sc_set_u64 :: proc "contextless" (sc: ^Scalar, i: u64) { @(require_results) sc_set_bytes :: proc "contextless" (sc: ^Scalar, b: []byte) -> bool { - if len(b) != 32 { - intrinsics.trap() - } + ensure_contextless(len(b) == 32, "edwards25519: invalid scalar size") b_ := (^[32]byte)(raw_data(b)) return field.fe_from_bytes(sc, b_) } sc_set_bytes_rfc8032 :: proc "contextless" (sc: ^Scalar, b: []byte) { - if len(b) != 32 { - intrinsics.trap() - } + ensure_contextless(len(b) == 32, "edwards25519: invalid scalar size") b_ := (^[32]byte)(raw_data(b)) field.fe_from_bytes_rfc8032(sc, b_) } diff --git a/core/crypto/_fiat/field_curve25519/field51.odin b/core/crypto/_fiat/field_curve25519/field51.odin index d039bd411..6716fa158 100644 --- a/core/crypto/_fiat/field_curve25519/field51.odin +++ b/core/crypto/_fiat/field_curve25519/field51.odin @@ -42,9 +42,12 @@ import "core:math/bits" Loose_Field_Element :: distinct [5]u64 Tight_Field_Element :: distinct [5]u64 +@(rodata) FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0} +@(rodata) FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0} +@(rodata) FE_SQRT_M1 := Tight_Field_Element { 1718705420411056, 234908883556509, diff --git a/core/crypto/_fiat/field_curve448/field.odin b/core/crypto/_fiat/field_curve448/field.odin new file mode 100644 
index 000000000..540d88f28 --- /dev/null +++ b/core/crypto/_fiat/field_curve448/field.odin @@ -0,0 +1,235 @@ +package field_curve448 + +import "core:mem" + +fe_relax_cast :: #force_inline proc "contextless" ( + arg1: ^Tight_Field_Element, +) -> ^Loose_Field_Element { + return (^Loose_Field_Element)(arg1) +} + +fe_tighten_cast :: #force_inline proc "contextless" ( + arg1: ^Loose_Field_Element, +) -> ^Tight_Field_Element { + return (^Tight_Field_Element)(arg1) +} + +fe_clear :: proc "contextless" ( + arg1: $T, +) where T == ^Tight_Field_Element || T == ^Loose_Field_Element { + mem.zero_explicit(arg1, size_of(arg1^)) +} + +fe_clear_vec :: proc "contextless" ( + arg1: $T, +) where T == []^Tight_Field_Element || T == []^Loose_Field_Element { + for fe in arg1 { + fe_clear(fe) + } +} + +fe_carry_mul_small :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, + arg2: u64, +) { + arg2_ := Loose_Field_Element{arg2, 0, 0, 0, 0, 0, 0, 0} + fe_carry_mul(out1, arg1, &arg2_) +} + +fe_carry_pow2k :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, + arg2: uint, +) { + // Special case: `arg1^(2 * 0) = 1`, though this should never happen. 
+ if arg2 == 0 { + fe_one(out1) + return + } + + fe_carry_square(out1, arg1) + for _ in 1 ..< arg2 { + fe_carry_square(out1, fe_relax_cast(out1)) + } +} + +fe_carry_inv :: proc "contextless" ( + out1: ^Tight_Field_Element, + arg1: ^Loose_Field_Element, +) { + // Inversion computation is derived from the addition chain: + // + // _10 = 2*1 + // _11 = 1 + _10 + // _110 = 2*_11 + // _111 = 1 + _110 + // _111000 = _111 << 3 + // _111111 = _111 + _111000 + // x12 = _111111 << 6 + _111111 + // x24 = x12 << 12 + x12 + // i34 = x24 << 6 + // x30 = _111111 + i34 + // x48 = i34 << 18 + x24 + // x96 = x48 << 48 + x48 + // x192 = x96 << 96 + x96 + // x222 = x192 << 30 + x30 + // x223 = 2*x222 + 1 + // return (x223 << 223 + x222) << 2 + 1 + // + // Operations: 447 squares 13 multiplies + // + // Generated by github.com/mmcloughlin/addchain v0.4.0. + + t0, t1, t2: Tight_Field_Element = ---, ---, --- + + // Step 1: t0 = x^0x2 + fe_carry_square(&t0, arg1) + + // Step 2: t0 = x^0x3 + fe_carry_mul(&t0, arg1, fe_relax_cast(&t0)) + + // t0.Sqr(t0) + fe_carry_square(&t0, fe_relax_cast(&t0)) + + // Step 4: t0 = x^0x7 + fe_carry_mul(&t0, arg1, fe_relax_cast(&t0)) + + // Step 7: t1 = x^0x38 + fe_carry_pow2k(&t1, fe_relax_cast(&t0), 3) + + // Step 8: t0 = x^0x3f + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 14: t1 = x^0xfc0 + fe_carry_pow2k(&t1, fe_relax_cast(&t0), 6) + + // Step 15: t1 = x^0xfff + fe_carry_mul(&t1, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 27: t2 = x^0xfff000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 12) + + // Step 28: t1 = x^0xffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 34: t2 = x^0x3fffffc0 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 6) + + // Step 35: t0 = x^0x3fffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t2)) + + // Step 53: t2 = x^0xffffff000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t2), 18) + + // Step 54: t1 = x^0xffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), 
fe_relax_cast(&t2)) + + // Step 102: t2 = x^0xffffffffffff000000000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 48) + + // Step 103: t1 = x^0xffffffffffffffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 199: t2 = x^0xffffffffffffffffffffffff000000000000000000000000 + fe_carry_pow2k(&t2, fe_relax_cast(&t1), 96) + + // Step 200: t1 = x^0xffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t1, fe_relax_cast(&t1), fe_relax_cast(&t2)) + + // Step 230: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffc0000000 + fe_carry_pow2k(&t1, fe_relax_cast(&t1), 30) + + // Step 231: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 232: t1 = x^0x7ffffffffffffffffffffffffffffffffffffffffffffffffffffffe + fe_carry_square(&t1, fe_relax_cast(&t0)) + + // Step 233: t1 = x^0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t1, arg1, fe_relax_cast(&t1)) + + // Step 456: t1 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffff80000000000000000000000000000000000000000000000000000000 + fe_carry_pow2k(&t1, fe_relax_cast(&t1), 223) + + // Step 457: t0 = x^0x3fffffffffffffffffffffffffffffffffffffffffffffffffffffffbfffffffffffffffffffffffffffffffffffffffffffffffffffffff + fe_carry_mul(&t0, fe_relax_cast(&t0), fe_relax_cast(&t1)) + + // Step 459: t0 = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffc + fe_carry_pow2k(&t0, fe_relax_cast(&t0), 2) + + // Step 460: z = x^0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffffffffffffffffffffffffffffffffffffffffffffffffffffd + fe_carry_mul(out1, arg1, fe_relax_cast(&t0)) + + fe_clear_vec([]^Tight_Field_Element{&t0, &t1, &t2}) +} + +fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 0 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 + out1[5] = 0 + out1[6] = 0 + out1[7] = 0 
+} + +fe_one :: proc "contextless" (out1: ^Tight_Field_Element) { + out1[0] = 1 + out1[1] = 0 + out1[2] = 0 + out1[3] = 0 + out1[4] = 0 + out1[5] = 0 + out1[6] = 0 + out1[7] = 0 +} + +fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + x6 := arg1[5] + x7 := arg1[6] + x8 := arg1[7] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +@(optimization_mode = "none") +fe_cond_swap :: #force_no_inline proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) { + mask := (u64(arg1) * 0xffffffffffffffff) + x := (out1[0] ~ out2[0]) & mask + x1, y1 := out1[0] ~ x, out2[0] ~ x + x = (out1[1] ~ out2[1]) & mask + x2, y2 := out1[1] ~ x, out2[1] ~ x + x = (out1[2] ~ out2[2]) & mask + x3, y3 := out1[2] ~ x, out2[2] ~ x + x = (out1[3] ~ out2[3]) & mask + x4, y4 := out1[3] ~ x, out2[3] ~ x + x = (out1[4] ~ out2[4]) & mask + x5, y5 := out1[4] ~ x, out2[4] ~ x + x = (out1[5] ~ out2[5]) & mask + x6, y6 := out1[5] ~ x, out2[5] ~ x + x = (out1[6] ~ out2[6]) & mask + x7, y7 := out1[6] ~ x, out2[6] ~ x + x = (out1[7] ~ out2[7]) & mask + x8, y8 := out1[7] ~ x, out2[7] ~ x + out1[0], out2[0] = x1, y1 + out1[1], out2[1] = x2, y2 + out1[2], out2[2] = x3, y3 + out1[3], out2[3] = x4, y4 + out1[4], out2[4] = x5, y5 + out1[5], out2[5] = x6, y6 + out1[6], out2[6] = x7, y7 + out1[7], out2[7] = x8, y8 +} \ No newline at end of file diff --git a/core/crypto/_fiat/field_curve448/field51.odin b/core/crypto/_fiat/field_curve448/field51.odin new file mode 100644 index 000000000..d8e49e04d --- /dev/null +++ b/core/crypto/_fiat/field_curve448/field51.odin @@ -0,0 +1,1060 @@ +// The BSD 1-Clause License (BSD-1-Clause) +// +// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file) +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design, +// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package field_curve448 + +// The file provides arithmetic on the field Z/(2^448 - 2^224 - 1) using +// unsaturated 64-bit integer arithmetic. It is derived primarily +// from the machine generated Golang output from the fiat-crypto project. +// +// While the base implementation is provably correct, this implementation +// makes no such claims as the port and optimizations were done by hand. +// +// TODO: +// * When fiat-crypto supports it, using a saturated 64-bit limbs +// instead of 56-bit limbs will be faster, though the gains are +// minimal unless adcx/adox/mulx are used. 
+ +import fiat "core:crypto/_fiat" +import "core:math/bits" + +Loose_Field_Element :: distinct [8]u64 +Tight_Field_Element :: distinct [8]u64 + +@(rodata) +FE_ZERO := Tight_Field_Element{0, 0, 0, 0, 0, 0, 0, 0} +@(rodata) +FE_ONE := Tight_Field_Element{1, 0, 0, 0, 0, 0, 0, 0} + +_addcarryx_u56 :: #force_inline proc "contextless" ( + arg1: fiat.u1, + arg2, arg3: u64, +) -> ( + out1: u64, + out2: fiat.u1, +) { + x1 := ((u64(arg1) + arg2) + arg3) + x2 := (x1 & 0xffffffffffffff) + x3 := fiat.u1((x1 >> 56)) + out1 = x2 + out2 = x3 + return +} + +_subborrowx_u56 :: #force_inline proc "contextless" ( + arg1: fiat.u1, + arg2, arg3: u64, +) -> ( + out1: u64, + out2: fiat.u1, +) { + x1 := ((i64(arg2) - i64(arg1)) - i64(arg3)) + x2 := fiat.u1((x1 >> 56)) + x3 := (u64(x1) & 0xffffffffffffff) + out1 = x3 + out2 = (0x0 - fiat.u1(x2)) + return +} + +fe_carry_mul :: proc "contextless" (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) { + x2, x1 := bits.mul_u64(arg1[7], arg2[7]) + x4, x3 := bits.mul_u64(arg1[7], arg2[6]) + x6, x5 := bits.mul_u64(arg1[7], arg2[5]) + x8, x7 := bits.mul_u64(arg1[6], arg2[7]) + x10, x9 := bits.mul_u64(arg1[6], arg2[6]) + x12, x11 := bits.mul_u64(arg1[5], arg2[7]) + x14, x13 := bits.mul_u64(arg1[7], arg2[7]) + x16, x15 := bits.mul_u64(arg1[7], arg2[6]) + x18, x17 := bits.mul_u64(arg1[7], arg2[5]) + x20, x19 := bits.mul_u64(arg1[6], arg2[7]) + x22, x21 := bits.mul_u64(arg1[6], arg2[6]) + x24, x23 := bits.mul_u64(arg1[5], arg2[7]) + x26, x25 := bits.mul_u64(arg1[7], arg2[7]) + x28, x27 := bits.mul_u64(arg1[7], arg2[6]) + x30, x29 := bits.mul_u64(arg1[7], arg2[5]) + x32, x31 := bits.mul_u64(arg1[7], arg2[4]) + x34, x33 := bits.mul_u64(arg1[7], arg2[3]) + x36, x35 := bits.mul_u64(arg1[7], arg2[2]) + x38, x37 := bits.mul_u64(arg1[7], arg2[1]) + x40, x39 := bits.mul_u64(arg1[6], arg2[7]) + x42, x41 := bits.mul_u64(arg1[6], arg2[6]) + x44, x43 := bits.mul_u64(arg1[6], arg2[5]) + x46, x45 := bits.mul_u64(arg1[6], arg2[4]) + x48, x47 := 
bits.mul_u64(arg1[6], arg2[3]) + x50, x49 := bits.mul_u64(arg1[6], arg2[2]) + x52, x51 := bits.mul_u64(arg1[5], arg2[7]) + x54, x53 := bits.mul_u64(arg1[5], arg2[6]) + x56, x55 := bits.mul_u64(arg1[5], arg2[5]) + x58, x57 := bits.mul_u64(arg1[5], arg2[4]) + x60, x59 := bits.mul_u64(arg1[5], arg2[3]) + x62, x61 := bits.mul_u64(arg1[4], arg2[7]) + x64, x63 := bits.mul_u64(arg1[4], arg2[6]) + x66, x65 := bits.mul_u64(arg1[4], arg2[5]) + x68, x67 := bits.mul_u64(arg1[4], arg2[4]) + x70, x69 := bits.mul_u64(arg1[3], arg2[7]) + x72, x71 := bits.mul_u64(arg1[3], arg2[6]) + x74, x73 := bits.mul_u64(arg1[3], arg2[5]) + x76, x75 := bits.mul_u64(arg1[2], arg2[7]) + x78, x77 := bits.mul_u64(arg1[2], arg2[6]) + x80, x79 := bits.mul_u64(arg1[1], arg2[7]) + x82, x81 := bits.mul_u64(arg1[7], arg2[4]) + x84, x83 := bits.mul_u64(arg1[7], arg2[3]) + x86, x85 := bits.mul_u64(arg1[7], arg2[2]) + x88, x87 := bits.mul_u64(arg1[7], arg2[1]) + x90, x89 := bits.mul_u64(arg1[6], arg2[5]) + x92, x91 := bits.mul_u64(arg1[6], arg2[4]) + x94, x93 := bits.mul_u64(arg1[6], arg2[3]) + x96, x95 := bits.mul_u64(arg1[6], arg2[2]) + x98, x97 := bits.mul_u64(arg1[5], arg2[6]) + x100, x99 := bits.mul_u64(arg1[5], arg2[5]) + x102, x101 := bits.mul_u64(arg1[5], arg2[4]) + x104, x103 := bits.mul_u64(arg1[5], arg2[3]) + x106, x105 := bits.mul_u64(arg1[4], arg2[7]) + x108, x107 := bits.mul_u64(arg1[4], arg2[6]) + x110, x109 := bits.mul_u64(arg1[4], arg2[5]) + x112, x111 := bits.mul_u64(arg1[4], arg2[4]) + x114, x113 := bits.mul_u64(arg1[3], arg2[7]) + x116, x115 := bits.mul_u64(arg1[3], arg2[6]) + x118, x117 := bits.mul_u64(arg1[3], arg2[5]) + x120, x119 := bits.mul_u64(arg1[2], arg2[7]) + x122, x121 := bits.mul_u64(arg1[2], arg2[6]) + x124, x123 := bits.mul_u64(arg1[1], arg2[7]) + x126, x125 := bits.mul_u64(arg1[7], arg2[0]) + x128, x127 := bits.mul_u64(arg1[6], arg2[1]) + x130, x129 := bits.mul_u64(arg1[6], arg2[0]) + x132, x131 := bits.mul_u64(arg1[5], arg2[2]) + x134, x133 := bits.mul_u64(arg1[5], 
arg2[1]) + x136, x135 := bits.mul_u64(arg1[5], arg2[0]) + x138, x137 := bits.mul_u64(arg1[4], arg2[3]) + x140, x139 := bits.mul_u64(arg1[4], arg2[2]) + x142, x141 := bits.mul_u64(arg1[4], arg2[1]) + x144, x143 := bits.mul_u64(arg1[4], arg2[0]) + x146, x145 := bits.mul_u64(arg1[3], arg2[4]) + x148, x147 := bits.mul_u64(arg1[3], arg2[3]) + x150, x149 := bits.mul_u64(arg1[3], arg2[2]) + x152, x151 := bits.mul_u64(arg1[3], arg2[1]) + x154, x153 := bits.mul_u64(arg1[3], arg2[0]) + x156, x155 := bits.mul_u64(arg1[2], arg2[5]) + x158, x157 := bits.mul_u64(arg1[2], arg2[4]) + x160, x159 := bits.mul_u64(arg1[2], arg2[3]) + x162, x161 := bits.mul_u64(arg1[2], arg2[2]) + x164, x163 := bits.mul_u64(arg1[2], arg2[1]) + x166, x165 := bits.mul_u64(arg1[2], arg2[0]) + x168, x167 := bits.mul_u64(arg1[1], arg2[6]) + x170, x169 := bits.mul_u64(arg1[1], arg2[5]) + x172, x171 := bits.mul_u64(arg1[1], arg2[4]) + x174, x173 := bits.mul_u64(arg1[1], arg2[3]) + x176, x175 := bits.mul_u64(arg1[1], arg2[2]) + x178, x177 := bits.mul_u64(arg1[1], arg2[1]) + x180, x179 := bits.mul_u64(arg1[1], arg2[0]) + x182, x181 := bits.mul_u64(arg1[0], arg2[7]) + x184, x183 := bits.mul_u64(arg1[0], arg2[6]) + x186, x185 := bits.mul_u64(arg1[0], arg2[5]) + x188, x187 := bits.mul_u64(arg1[0], arg2[4]) + x190, x189 := bits.mul_u64(arg1[0], arg2[3]) + x192, x191 := bits.mul_u64(arg1[0], arg2[2]) + x194, x193 := bits.mul_u64(arg1[0], arg2[1]) + x196, x195 := bits.mul_u64(arg1[0], arg2[0]) + x197, x198 := bits.add_u64(x43, x31, u64(0x0)) + x199, _ := bits.add_u64(x44, x32, u64(fiat.u1(x198))) + x201, x202 := bits.add_u64(x53, x197, u64(0x0)) + x203, _ := bits.add_u64(x54, x199, u64(fiat.u1(x202))) + x205, x206 := bits.add_u64(x61, x201, u64(0x0)) + x207, _ := bits.add_u64(x62, x203, u64(fiat.u1(x206))) + x209, x210 := bits.add_u64(x153, x205, u64(0x0)) + x211, _ := bits.add_u64(x154, x207, u64(fiat.u1(x210))) + x213, x214 := bits.add_u64(x163, x209, u64(0x0)) + x215, _ := bits.add_u64(x164, x211, 
u64(fiat.u1(x214))) + x217, x218 := bits.add_u64(x175, x213, u64(0x0)) + x219, _ := bits.add_u64(x176, x215, u64(fiat.u1(x218))) + x221, x222 := bits.add_u64(x189, x217, u64(0x0)) + x223, _ := bits.add_u64(x190, x219, u64(fiat.u1(x222))) + x225 := ((x221 >> 56) | ((x223 << 8) & 0xffffffffffffffff)) + x226 := (x221 & 0xffffffffffffff) + x227, x228 := bits.add_u64(x89, x81, u64(0x0)) + x229, _ := bits.add_u64(x90, x82, u64(fiat.u1(x228))) + x231, x232 := bits.add_u64(x97, x227, u64(0x0)) + x233, _ := bits.add_u64(x98, x229, u64(fiat.u1(x232))) + x235, x236 := bits.add_u64(x105, x231, u64(0x0)) + x237, _ := bits.add_u64(x106, x233, u64(fiat.u1(x236))) + x239, x240 := bits.add_u64(x125, x235, u64(0x0)) + x241, _ := bits.add_u64(x126, x237, u64(fiat.u1(x240))) + x243, x244 := bits.add_u64(x127, x239, u64(0x0)) + x245, _ := bits.add_u64(x128, x241, u64(fiat.u1(x244))) + x247, x248 := bits.add_u64(x131, x243, u64(0x0)) + x249, _ := bits.add_u64(x132, x245, u64(fiat.u1(x248))) + x251, x252 := bits.add_u64(x137, x247, u64(0x0)) + x253, _ := bits.add_u64(x138, x249, u64(fiat.u1(x252))) + x255, x256 := bits.add_u64(x145, x251, u64(0x0)) + x257, _ := bits.add_u64(x146, x253, u64(fiat.u1(x256))) + x259, x260 := bits.add_u64(x155, x255, u64(0x0)) + x261, _ := bits.add_u64(x156, x257, u64(fiat.u1(x260))) + x263, x264 := bits.add_u64(x167, x259, u64(0x0)) + x265, _ := bits.add_u64(x168, x261, u64(fiat.u1(x264))) + x267, x268 := bits.add_u64(x181, x263, u64(0x0)) + x269, _ := bits.add_u64(x182, x265, u64(fiat.u1(x268))) + x271, x272 := bits.add_u64(x25, x13, u64(0x0)) + x273, _ := bits.add_u64(x26, x14, u64(fiat.u1(x272))) + x275, x276 := bits.add_u64(x83, x271, u64(0x0)) + x277, _ := bits.add_u64(x84, x273, u64(fiat.u1(x276))) + x279, x280 := bits.add_u64(x91, x275, u64(0x0)) + x281, _ := bits.add_u64(x92, x277, u64(fiat.u1(x280))) + x283, x284 := bits.add_u64(x99, x279, u64(0x0)) + x285, _ := bits.add_u64(x100, x281, u64(fiat.u1(x284))) + x287, x288 := bits.add_u64(x107, x283, 
u64(0x0)) + x289, _ := bits.add_u64(x108, x285, u64(fiat.u1(x288))) + x291, x292 := bits.add_u64(x113, x287, u64(0x0)) + x293, _ := bits.add_u64(x114, x289, u64(fiat.u1(x292))) + x295, x296 := bits.add_u64(x129, x291, u64(0x0)) + x297, _ := bits.add_u64(x130, x293, u64(fiat.u1(x296))) + x299, x300 := bits.add_u64(x133, x295, u64(0x0)) + x301, _ := bits.add_u64(x134, x297, u64(fiat.u1(x300))) + x303, x304 := bits.add_u64(x139, x299, u64(0x0)) + x305, _ := bits.add_u64(x140, x301, u64(fiat.u1(x304))) + x307, x308 := bits.add_u64(x147, x303, u64(0x0)) + x309, _ := bits.add_u64(x148, x305, u64(fiat.u1(x308))) + x311, x312 := bits.add_u64(x157, x307, u64(0x0)) + x313, _ := bits.add_u64(x158, x309, u64(fiat.u1(x312))) + x315, x316 := bits.add_u64(x169, x311, u64(0x0)) + x317, _ := bits.add_u64(x170, x313, u64(fiat.u1(x316))) + x319, x320 := bits.add_u64(x183, x315, u64(0x0)) + x321, _ := bits.add_u64(x184, x317, u64(fiat.u1(x320))) + x323, x324 := bits.add_u64(x19, x15, u64(0x0)) + x325, _ := bits.add_u64(x20, x16, u64(fiat.u1(x324))) + x327, x328 := bits.add_u64(x27, x323, u64(0x0)) + x329, _ := bits.add_u64(x28, x325, u64(fiat.u1(x328))) + x331, x332 := bits.add_u64(x39, x327, u64(0x0)) + x333, _ := bits.add_u64(x40, x329, u64(fiat.u1(x332))) + x335, x336 := bits.add_u64(x85, x331, u64(0x0)) + x337, _ := bits.add_u64(x86, x333, u64(fiat.u1(x336))) + x339, x340 := bits.add_u64(x93, x335, u64(0x0)) + x341, _ := bits.add_u64(x94, x337, u64(fiat.u1(x340))) + x343, x344 := bits.add_u64(x101, x339, u64(0x0)) + x345, _ := bits.add_u64(x102, x341, u64(fiat.u1(x344))) + x347, x348 := bits.add_u64(x109, x343, u64(0x0)) + x349, _ := bits.add_u64(x110, x345, u64(fiat.u1(x348))) + x351, x352 := bits.add_u64(x115, x347, u64(0x0)) + x353, _ := bits.add_u64(x116, x349, u64(fiat.u1(x352))) + x355, x356 := bits.add_u64(x119, x351, u64(0x0)) + x357, _ := bits.add_u64(x120, x353, u64(fiat.u1(x356))) + x359, x360 := bits.add_u64(x135, x355, u64(0x0)) + x361, _ := bits.add_u64(x136, x357, 
u64(fiat.u1(x360))) + x363, x364 := bits.add_u64(x141, x359, u64(0x0)) + x365, _ := bits.add_u64(x142, x361, u64(fiat.u1(x364))) + x367, x368 := bits.add_u64(x149, x363, u64(0x0)) + x369, _ := bits.add_u64(x150, x365, u64(fiat.u1(x368))) + x371, x372 := bits.add_u64(x159, x367, u64(0x0)) + x373, _ := bits.add_u64(x160, x369, u64(fiat.u1(x372))) + x375, x376 := bits.add_u64(x171, x371, u64(0x0)) + x377, _ := bits.add_u64(x172, x373, u64(fiat.u1(x376))) + x379, x380 := bits.add_u64(x185, x375, u64(0x0)) + x381, _ := bits.add_u64(x186, x377, u64(fiat.u1(x380))) + x383, x384 := bits.add_u64(x21, x17, u64(0x0)) + x385, _ := bits.add_u64(x22, x18, u64(fiat.u1(x384))) + x387, x388 := bits.add_u64(x23, x383, u64(0x0)) + x389, _ := bits.add_u64(x24, x385, u64(fiat.u1(x388))) + x391, x392 := bits.add_u64(x29, x387, u64(0x0)) + x393, _ := bits.add_u64(x30, x389, u64(fiat.u1(x392))) + x395, x396 := bits.add_u64(x41, x391, u64(0x0)) + x397, _ := bits.add_u64(x42, x393, u64(fiat.u1(x396))) + x399, x400 := bits.add_u64(x51, x395, u64(0x0)) + x401, _ := bits.add_u64(x52, x397, u64(fiat.u1(x400))) + x403, x404 := bits.add_u64(x87, x399, u64(0x0)) + x405, _ := bits.add_u64(x88, x401, u64(fiat.u1(x404))) + x407, x408 := bits.add_u64(x95, x403, u64(0x0)) + x409, _ := bits.add_u64(x96, x405, u64(fiat.u1(x408))) + x411, x412 := bits.add_u64(x103, x407, u64(0x0)) + x413, _ := bits.add_u64(x104, x409, u64(fiat.u1(x412))) + x415, x416 := bits.add_u64(x111, x411, u64(0x0)) + x417, _ := bits.add_u64(x112, x413, u64(fiat.u1(x416))) + x419, x420 := bits.add_u64(x117, x415, u64(0x0)) + x421, _ := bits.add_u64(x118, x417, u64(fiat.u1(x420))) + x423, x424 := bits.add_u64(x121, x419, u64(0x0)) + x425, _ := bits.add_u64(x122, x421, u64(fiat.u1(x424))) + x427, x428 := bits.add_u64(x123, x423, u64(0x0)) + x429, _ := bits.add_u64(x124, x425, u64(fiat.u1(x428))) + x431, x432 := bits.add_u64(x143, x427, u64(0x0)) + x433, _ := bits.add_u64(x144, x429, u64(fiat.u1(x432))) + x435, x436 := 
bits.add_u64(x151, x431, u64(0x0)) + x437, _ := bits.add_u64(x152, x433, u64(fiat.u1(x436))) + x439, x440 := bits.add_u64(x161, x435, u64(0x0)) + x441, _ := bits.add_u64(x162, x437, u64(fiat.u1(x440))) + x443, x444 := bits.add_u64(x173, x439, u64(0x0)) + x445, _ := bits.add_u64(x174, x441, u64(fiat.u1(x444))) + x447, x448 := bits.add_u64(x187, x443, u64(0x0)) + x449, _ := bits.add_u64(x188, x445, u64(fiat.u1(x448))) + x451, x452 := bits.add_u64(x33, x1, u64(0x0)) + x453, _ := bits.add_u64(x34, x2, u64(fiat.u1(x452))) + x455, x456 := bits.add_u64(x45, x451, u64(0x0)) + x457, _ := bits.add_u64(x46, x453, u64(fiat.u1(x456))) + x459, x460 := bits.add_u64(x55, x455, u64(0x0)) + x461, _ := bits.add_u64(x56, x457, u64(fiat.u1(x460))) + x463, x464 := bits.add_u64(x63, x459, u64(0x0)) + x465, _ := bits.add_u64(x64, x461, u64(fiat.u1(x464))) + x467, x468 := bits.add_u64(x69, x463, u64(0x0)) + x469, _ := bits.add_u64(x70, x465, u64(fiat.u1(x468))) + x471, x472 := bits.add_u64(x165, x467, u64(0x0)) + x473, _ := bits.add_u64(x166, x469, u64(fiat.u1(x472))) + x475, x476 := bits.add_u64(x177, x471, u64(0x0)) + x477, _ := bits.add_u64(x178, x473, u64(fiat.u1(x476))) + x479, x480 := bits.add_u64(x191, x475, u64(0x0)) + x481, _ := bits.add_u64(x192, x477, u64(fiat.u1(x480))) + x483, x484 := bits.add_u64(x7, x3, u64(0x0)) + x485, _ := bits.add_u64(x8, x4, u64(fiat.u1(x484))) + x487, x488 := bits.add_u64(x35, x483, u64(0x0)) + x489, _ := bits.add_u64(x36, x485, u64(fiat.u1(x488))) + x491, x492 := bits.add_u64(x47, x487, u64(0x0)) + x493, _ := bits.add_u64(x48, x489, u64(fiat.u1(x492))) + x495, x496 := bits.add_u64(x57, x491, u64(0x0)) + x497, _ := bits.add_u64(x58, x493, u64(fiat.u1(x496))) + x499, x500 := bits.add_u64(x65, x495, u64(0x0)) + x501, _ := bits.add_u64(x66, x497, u64(fiat.u1(x500))) + x503, x504 := bits.add_u64(x71, x499, u64(0x0)) + x505, _ := bits.add_u64(x72, x501, u64(fiat.u1(x504))) + x507, x508 := bits.add_u64(x75, x503, u64(0x0)) + x509, _ := bits.add_u64(x76, 
x505, u64(fiat.u1(x508))) + x511, x512 := bits.add_u64(x179, x507, u64(0x0)) + x513, _ := bits.add_u64(x180, x509, u64(fiat.u1(x512))) + x515, x516 := bits.add_u64(x193, x511, u64(0x0)) + x517, _ := bits.add_u64(x194, x513, u64(fiat.u1(x516))) + x519, x520 := bits.add_u64(x9, x5, u64(0x0)) + x521, _ := bits.add_u64(x10, x6, u64(fiat.u1(x520))) + x523, x524 := bits.add_u64(x11, x519, u64(0x0)) + x525, _ := bits.add_u64(x12, x521, u64(fiat.u1(x524))) + x527, x528 := bits.add_u64(x37, x523, u64(0x0)) + x529, _ := bits.add_u64(x38, x525, u64(fiat.u1(x528))) + x531, x532 := bits.add_u64(x49, x527, u64(0x0)) + x533, _ := bits.add_u64(x50, x529, u64(fiat.u1(x532))) + x535, x536 := bits.add_u64(x59, x531, u64(0x0)) + x537, _ := bits.add_u64(x60, x533, u64(fiat.u1(x536))) + x539, x540 := bits.add_u64(x67, x535, u64(0x0)) + x541, _ := bits.add_u64(x68, x537, u64(fiat.u1(x540))) + x543, x544 := bits.add_u64(x73, x539, u64(0x0)) + x545, _ := bits.add_u64(x74, x541, u64(fiat.u1(x544))) + x547, x548 := bits.add_u64(x77, x543, u64(0x0)) + x549, _ := bits.add_u64(x78, x545, u64(fiat.u1(x548))) + x551, x552 := bits.add_u64(x79, x547, u64(0x0)) + x553, _ := bits.add_u64(x80, x549, u64(fiat.u1(x552))) + x555, x556 := bits.add_u64(x195, x551, u64(0x0)) + x557, _ := bits.add_u64(x196, x553, u64(fiat.u1(x556))) + x559, x560 := bits.add_u64(x225, x447, u64(0x0)) + x561 := (u64(fiat.u1(x560)) + x449) + x562 := ((x267 >> 56) | ((x269 << 8) & 0xffffffffffffffff)) + x563 := (x267 & 0xffffffffffffff) + x564, x565 := bits.add_u64(x559, x562, u64(0x0)) + x566 := (u64(fiat.u1(x565)) + x561) + x567 := ((x564 >> 56) | ((x566 << 8) & 0xffffffffffffffff)) + x568 := (x564 & 0xffffffffffffff) + x569, x570 := bits.add_u64(x555, x562, u64(0x0)) + x571 := (u64(fiat.u1(x570)) + x557) + x572, x573 := bits.add_u64(x567, x379, u64(0x0)) + x574 := (u64(fiat.u1(x573)) + x381) + x575 := ((x569 >> 56) | ((x571 << 8) & 0xffffffffffffffff)) + x576 := (x569 & 0xffffffffffffff) + x577, x578 := bits.add_u64(x575, 
x515, u64(0x0)) + x579 := (u64(fiat.u1(x578)) + x517) + x580 := ((x572 >> 56) | ((x574 << 8) & 0xffffffffffffffff)) + x581 := (x572 & 0xffffffffffffff) + x582, x583 := bits.add_u64(x580, x319, u64(0x0)) + x584 := (u64(fiat.u1(x583)) + x321) + x585 := ((x577 >> 56) | ((x579 << 8) & 0xffffffffffffffff)) + x586 := (x577 & 0xffffffffffffff) + x587, x588 := bits.add_u64(x585, x479, u64(0x0)) + x589 := (u64(fiat.u1(x588)) + x481) + x590 := ((x582 >> 56) | ((x584 << 8) & 0xffffffffffffffff)) + x591 := (x582 & 0xffffffffffffff) + x592 := (x590 + x563) + x593 := ((x587 >> 56) | ((x589 << 8) & 0xffffffffffffffff)) + x594 := (x587 & 0xffffffffffffff) + x595 := (x593 + x226) + x596 := (x592 >> 56) + x597 := (x592 & 0xffffffffffffff) + x598 := (x595 >> 56) + x599 := (x595 & 0xffffffffffffff) + x600 := (x568 + x596) + x601 := (x576 + x596) + x602 := (x598 + x600) + x603 := fiat.u1((x602 >> 56)) + x604 := (x602 & 0xffffffffffffff) + x605 := (u64(x603) + x581) + x606 := fiat.u1((x601 >> 56)) + x607 := (x601 & 0xffffffffffffff) + x608 := (u64(x606) + x586) + out1[0] = x607 + out1[1] = x608 + out1[2] = x594 + out1[3] = x599 + out1[4] = x604 + out1[5] = x605 + out1[6] = x591 + out1[7] = x597 +} + +fe_carry_square :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[7] + x2 := arg1[7] + x3 := (x1 * 0x2) + x4 := (x2 * 0x2) + x5 := (arg1[7] * 0x2) + x6 := arg1[6] + x7 := arg1[6] + x8 := (x6 * 0x2) + x9 := (x7 * 0x2) + x10 := (arg1[6] * 0x2) + x11 := arg1[5] + x12 := arg1[5] + x13 := (x11 * 0x2) + x14 := (x12 * 0x2) + x15 := (arg1[5] * 0x2) + x16 := arg1[4] + x17 := arg1[4] + x18 := (arg1[4] * 0x2) + x19 := (arg1[3] * 0x2) + x20 := (arg1[2] * 0x2) + x21 := (arg1[1] * 0x2) + x23, x22 := bits.mul_u64(arg1[7], x1) + x25, x24 := bits.mul_u64(arg1[6], x3) + x27, x26 := bits.mul_u64(arg1[6], x6) + x29, x28 := bits.mul_u64(arg1[5], x3) + x31, x30 := bits.mul_u64(arg1[7], x1) + x33, x32 := bits.mul_u64(arg1[6], x3) + x35, x34 := bits.mul_u64(arg1[6], x6) 
+ x37, x36 := bits.mul_u64(arg1[5], x3) + x39, x38 := bits.mul_u64(arg1[7], x2) + x41, x40 := bits.mul_u64(arg1[6], x4) + x43, x42 := bits.mul_u64(arg1[6], x7) + x45, x44 := bits.mul_u64(arg1[5], x4) + x47, x46 := bits.mul_u64(arg1[5], x9) + x49, x48 := bits.mul_u64(arg1[5], x8) + x51, x50 := bits.mul_u64(arg1[5], x12) + x53, x52 := bits.mul_u64(arg1[5], x11) + x55, x54 := bits.mul_u64(arg1[4], x4) + x57, x56 := bits.mul_u64(arg1[4], x3) + x59, x58 := bits.mul_u64(arg1[4], x9) + x61, x60 := bits.mul_u64(arg1[4], x8) + x63, x62 := bits.mul_u64(arg1[4], x14) + x65, x64 := bits.mul_u64(arg1[4], x13) + x67, x66 := bits.mul_u64(arg1[4], x17) + x69, x68 := bits.mul_u64(arg1[4], x16) + x71, x70 := bits.mul_u64(arg1[3], x4) + x73, x72 := bits.mul_u64(arg1[3], x3) + x75, x74 := bits.mul_u64(arg1[3], x9) + x77, x76 := bits.mul_u64(arg1[3], x8) + x79, x78 := bits.mul_u64(arg1[3], x14) + x81, x80 := bits.mul_u64(arg1[3], x13) + x83, x82 := bits.mul_u64(arg1[3], x18) + x85, x84 := bits.mul_u64(arg1[3], arg1[3]) + x87, x86 := bits.mul_u64(arg1[2], x4) + x89, x88 := bits.mul_u64(arg1[2], x3) + x91, x90 := bits.mul_u64(arg1[2], x9) + x93, x92 := bits.mul_u64(arg1[2], x8) + x95, x94 := bits.mul_u64(arg1[2], x15) + x97, x96 := bits.mul_u64(arg1[2], x18) + x99, x98 := bits.mul_u64(arg1[2], x19) + x101, x100 := bits.mul_u64(arg1[2], arg1[2]) + x103, x102 := bits.mul_u64(arg1[1], x4) + x105, x104 := bits.mul_u64(arg1[1], x3) + x107, x106 := bits.mul_u64(arg1[1], x10) + x109, x108 := bits.mul_u64(arg1[1], x15) + x111, x110 := bits.mul_u64(arg1[1], x18) + x113, x112 := bits.mul_u64(arg1[1], x19) + x115, x114 := bits.mul_u64(arg1[1], x20) + x117, x116 := bits.mul_u64(arg1[1], arg1[1]) + x119, x118 := bits.mul_u64(arg1[0], x5) + x121, x120 := bits.mul_u64(arg1[0], x10) + x123, x122 := bits.mul_u64(arg1[0], x15) + x125, x124 := bits.mul_u64(arg1[0], x18) + x127, x126 := bits.mul_u64(arg1[0], x19) + x129, x128 := bits.mul_u64(arg1[0], x20) + x131, x130 := bits.mul_u64(arg1[0], x21) + x133, 
x132 := bits.mul_u64(arg1[0], arg1[0]) + x134, x135 := bits.add_u64(x54, x46, u64(0x0)) + x136, _ := bits.add_u64(x55, x47, u64(fiat.u1(x135))) + x138, x139 := bits.add_u64(x114, x134, u64(0x0)) + x140, _ := bits.add_u64(x115, x136, u64(fiat.u1(x139))) + x142, x143 := bits.add_u64(x126, x138, u64(0x0)) + x144, _ := bits.add_u64(x127, x140, u64(fiat.u1(x143))) + x146 := ((x142 >> 56) | ((x144 << 8) & 0xffffffffffffffff)) + x147 := (x142 & 0xffffffffffffff) + x148, x149 := bits.add_u64(x56, x48, u64(0x0)) + x150, _ := bits.add_u64(x57, x49, u64(fiat.u1(x149))) + x152, x153 := bits.add_u64(x82, x148, u64(0x0)) + x154, _ := bits.add_u64(x83, x150, u64(fiat.u1(x153))) + x156, x157 := bits.add_u64(x94, x152, u64(0x0)) + x158, _ := bits.add_u64(x95, x154, u64(fiat.u1(x157))) + x160, x161 := bits.add_u64(x106, x156, u64(0x0)) + x162, _ := bits.add_u64(x107, x158, u64(fiat.u1(x161))) + x164, x165 := bits.add_u64(x118, x160, u64(0x0)) + x166, _ := bits.add_u64(x119, x162, u64(fiat.u1(x165))) + x168, x169 := bits.add_u64(x38, x30, u64(0x0)) + x170, _ := bits.add_u64(x39, x31, u64(fiat.u1(x169))) + x172, x173 := bits.add_u64(x52, x168, u64(0x0)) + x174, _ := bits.add_u64(x53, x170, u64(fiat.u1(x173))) + x176, x177 := bits.add_u64(x60, x172, u64(0x0)) + x178, _ := bits.add_u64(x61, x174, u64(fiat.u1(x177))) + x180, x181 := bits.add_u64(x72, x176, u64(0x0)) + x182, _ := bits.add_u64(x73, x178, u64(fiat.u1(x181))) + x184, x185 := bits.add_u64(x84, x180, u64(0x0)) + x186, _ := bits.add_u64(x85, x182, u64(fiat.u1(x185))) + x188, x189 := bits.add_u64(x96, x184, u64(0x0)) + x190, _ := bits.add_u64(x97, x186, u64(fiat.u1(x189))) + x192, x193 := bits.add_u64(x108, x188, u64(0x0)) + x194, _ := bits.add_u64(x109, x190, u64(fiat.u1(x193))) + x196, x197 := bits.add_u64(x120, x192, u64(0x0)) + x198, _ := bits.add_u64(x121, x194, u64(fiat.u1(x197))) + x200, x201 := bits.add_u64(x40, x32, u64(0x0)) + x202, _ := bits.add_u64(x41, x33, u64(fiat.u1(x201))) + x204, x205 := bits.add_u64(x64, x200, 
u64(0x0)) + x206, _ := bits.add_u64(x65, x202, u64(fiat.u1(x205))) + x208, x209 := bits.add_u64(x76, x204, u64(0x0)) + x210, _ := bits.add_u64(x77, x206, u64(fiat.u1(x209))) + x212, x213 := bits.add_u64(x88, x208, u64(0x0)) + x214, _ := bits.add_u64(x89, x210, u64(fiat.u1(x213))) + x216, x217 := bits.add_u64(x98, x212, u64(0x0)) + x218, _ := bits.add_u64(x99, x214, u64(fiat.u1(x217))) + x220, x221 := bits.add_u64(x110, x216, u64(0x0)) + x222, _ := bits.add_u64(x111, x218, u64(fiat.u1(x221))) + x224, x225 := bits.add_u64(x122, x220, u64(0x0)) + x226, _ := bits.add_u64(x123, x222, u64(fiat.u1(x225))) + x228, x229 := bits.add_u64(x36, x34, u64(0x0)) + x230, _ := bits.add_u64(x37, x35, u64(fiat.u1(x229))) + x232, x233 := bits.add_u64(x42, x228, u64(0x0)) + x234, _ := bits.add_u64(x43, x230, u64(fiat.u1(x233))) + x236, x237 := bits.add_u64(x44, x232, u64(0x0)) + x238, _ := bits.add_u64(x45, x234, u64(fiat.u1(x237))) + x240, x241 := bits.add_u64(x68, x236, u64(0x0)) + x242, _ := bits.add_u64(x69, x238, u64(fiat.u1(x241))) + x244, x245 := bits.add_u64(x80, x240, u64(0x0)) + x246, _ := bits.add_u64(x81, x242, u64(fiat.u1(x245))) + x248, x249 := bits.add_u64(x92, x244, u64(0x0)) + x250, _ := bits.add_u64(x93, x246, u64(fiat.u1(x249))) + x252, x253 := bits.add_u64(x100, x248, u64(0x0)) + x254, _ := bits.add_u64(x101, x250, u64(fiat.u1(x253))) + x256, x257 := bits.add_u64(x104, x252, u64(0x0)) + x258, _ := bits.add_u64(x105, x254, u64(fiat.u1(x257))) + x260, x261 := bits.add_u64(x112, x256, u64(0x0)) + x262, _ := bits.add_u64(x113, x258, u64(fiat.u1(x261))) + x264, x265 := bits.add_u64(x124, x260, u64(0x0)) + x266, _ := bits.add_u64(x125, x262, u64(fiat.u1(x265))) + x268, x269 := bits.add_u64(x50, x22, u64(0x0)) + x270, _ := bits.add_u64(x51, x23, u64(fiat.u1(x269))) + x272, x273 := bits.add_u64(x58, x268, u64(0x0)) + x274, _ := bits.add_u64(x59, x270, u64(fiat.u1(x273))) + x276, x277 := bits.add_u64(x70, x272, u64(0x0)) + x278, _ := bits.add_u64(x71, x274, 
u64(fiat.u1(x277))) + x280, x281 := bits.add_u64(x116, x276, u64(0x0)) + x282, _ := bits.add_u64(x117, x278, u64(fiat.u1(x281))) + x284, x285 := bits.add_u64(x128, x280, u64(0x0)) + x286, _ := bits.add_u64(x129, x282, u64(fiat.u1(x285))) + x288, x289 := bits.add_u64(x62, x24, u64(0x0)) + x290, _ := bits.add_u64(x63, x25, u64(fiat.u1(x289))) + x292, x293 := bits.add_u64(x74, x288, u64(0x0)) + x294, _ := bits.add_u64(x75, x290, u64(fiat.u1(x293))) + x296, x297 := bits.add_u64(x86, x292, u64(0x0)) + x298, _ := bits.add_u64(x87, x294, u64(fiat.u1(x297))) + x300, x301 := bits.add_u64(x130, x296, u64(0x0)) + x302, _ := bits.add_u64(x131, x298, u64(fiat.u1(x301))) + x304, x305 := bits.add_u64(x28, x26, u64(0x0)) + x306, _ := bits.add_u64(x29, x27, u64(fiat.u1(x305))) + x308, x309 := bits.add_u64(x66, x304, u64(0x0)) + x310, _ := bits.add_u64(x67, x306, u64(fiat.u1(x309))) + x312, x313 := bits.add_u64(x78, x308, u64(0x0)) + x314, _ := bits.add_u64(x79, x310, u64(fiat.u1(x313))) + x316, x317 := bits.add_u64(x90, x312, u64(0x0)) + x318, _ := bits.add_u64(x91, x314, u64(fiat.u1(x317))) + x320, x321 := bits.add_u64(x102, x316, u64(0x0)) + x322, _ := bits.add_u64(x103, x318, u64(fiat.u1(x321))) + x324, x325 := bits.add_u64(x132, x320, u64(0x0)) + x326, _ := bits.add_u64(x133, x322, u64(fiat.u1(x325))) + x328, x329 := bits.add_u64(x146, x264, u64(0x0)) + x330 := (u64(fiat.u1(x329)) + x266) + x331 := ((x164 >> 56) | ((x166 << 8) & 0xffffffffffffffff)) + x332 := (x164 & 0xffffffffffffff) + x333, x334 := bits.add_u64(x328, x331, u64(0x0)) + x335 := (u64(fiat.u1(x334)) + x330) + x336 := ((x333 >> 56) | ((x335 << 8) & 0xffffffffffffffff)) + x337 := (x333 & 0xffffffffffffff) + x338, x339 := bits.add_u64(x324, x331, u64(0x0)) + x340 := (u64(fiat.u1(x339)) + x326) + x341, x342 := bits.add_u64(x336, x224, u64(0x0)) + x343 := (u64(fiat.u1(x342)) + x226) + x344 := ((x338 >> 56) | ((x340 << 8) & 0xffffffffffffffff)) + x345 := (x338 & 0xffffffffffffff) + x346, x347 := bits.add_u64(x344, 
x300, u64(0x0)) + x348 := (u64(fiat.u1(x347)) + x302) + x349 := ((x341 >> 56) | ((x343 << 8) & 0xffffffffffffffff)) + x350 := (x341 & 0xffffffffffffff) + x351, x352 := bits.add_u64(x349, x196, u64(0x0)) + x353 := (u64(fiat.u1(x352)) + x198) + x354 := ((x346 >> 56) | ((x348 << 8) & 0xffffffffffffffff)) + x355 := (x346 & 0xffffffffffffff) + x356, x357 := bits.add_u64(x354, x284, u64(0x0)) + x358 := (u64(fiat.u1(x357)) + x286) + x359 := ((x351 >> 56) | ((x353 << 8) & 0xffffffffffffffff)) + x360 := (x351 & 0xffffffffffffff) + x361 := (x359 + x332) + x362 := ((x356 >> 56) | ((x358 << 8) & 0xffffffffffffffff)) + x363 := (x356 & 0xffffffffffffff) + x364 := (x362 + x147) + x365 := (x361 >> 56) + x366 := (x361 & 0xffffffffffffff) + x367 := (x364 >> 56) + x368 := (x364 & 0xffffffffffffff) + x369 := (x337 + x365) + x370 := (x345 + x365) + x371 := (x367 + x369) + x372 := fiat.u1((x371 >> 56)) + x373 := (x371 & 0xffffffffffffff) + x374 := (u64(x372) + x350) + x375 := fiat.u1((x370 >> 56)) + x376 := (x370 & 0xffffffffffffff) + x377 := (u64(x375) + x355) + out1[0] = x376 + out1[1] = x377 + out1[2] = x363 + out1[3] = x368 + out1[4] = x373 + out1[5] = x374 + out1[6] = x360 + out1[7] = x366 +} + +fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) { + x1 := arg1[3] + x2 := arg1[7] + x3 := (x2 >> 56) + x4 := (((x1 >> 56) + arg1[4]) + x3) + x5 := (arg1[0] + x3) + x6 := ((x4 >> 56) + arg1[5]) + x7 := ((x5 >> 56) + arg1[1]) + x8 := ((x6 >> 56) + arg1[6]) + x9 := ((x7 >> 56) + arg1[2]) + x10 := ((x8 >> 56) + (x2 & 0xffffffffffffff)) + x11 := ((x9 >> 56) + (x1 & 0xffffffffffffff)) + x12 := fiat.u1((x10 >> 56)) + x13 := ((x5 & 0xffffffffffffff) + u64(x12)) + x14 := (u64(fiat.u1((x11 >> 56))) + ((x4 & 0xffffffffffffff) + u64(x12))) + x15 := (x13 & 0xffffffffffffff) + x16 := (u64(fiat.u1((x13 >> 56))) + (x7 & 0xffffffffffffff)) + x17 := (x9 & 0xffffffffffffff) + x18 := (x11 & 0xffffffffffffff) + x19 := (x14 & 0xffffffffffffff) + x20 := (u64(fiat.u1((x14 >> 
56))) + (x6 & 0xffffffffffffff)) + x21 := (x8 & 0xffffffffffffff) + x22 := (x10 & 0xffffffffffffff) + out1[0] = x15 + out1[1] = x16 + out1[2] = x17 + out1[3] = x18 + out1[4] = x19 + out1[5] = x20 + out1[6] = x21 + out1[7] = x22 +} + +fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := (arg1[0] + arg2[0]) + x2 := (arg1[1] + arg2[1]) + x3 := (arg1[2] + arg2[2]) + x4 := (arg1[3] + arg2[3]) + x5 := (arg1[4] + arg2[4]) + x6 := (arg1[5] + arg2[5]) + x7 := (arg1[6] + arg2[6]) + x8 := (arg1[7] + arg2[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) { + x1 := ((0x1fffffffffffffe + arg1[0]) - arg2[0]) + x2 := ((0x1fffffffffffffe + arg1[1]) - arg2[1]) + x3 := ((0x1fffffffffffffe + arg1[2]) - arg2[2]) + x4 := ((0x1fffffffffffffe + arg1[3]) - arg2[3]) + x5 := ((0x1fffffffffffffc + arg1[4]) - arg2[4]) + x6 := ((0x1fffffffffffffe + arg1[5]) - arg2[5]) + x7 := ((0x1fffffffffffffe + arg1[6]) - arg2[6]) + x8 := ((0x1fffffffffffffe + arg1[7]) - arg2[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := (0x1fffffffffffffe - arg1[0]) + x2 := (0x1fffffffffffffe - arg1[1]) + x3 := (0x1fffffffffffffe - arg1[2]) + x4 := (0x1fffffffffffffe - arg1[3]) + x5 := (0x1fffffffffffffc - arg1[4]) + x6 := (0x1fffffffffffffe - arg1[5]) + x7 := (0x1fffffffffffffe - arg1[6]) + x8 := (0x1fffffffffffffe - arg1[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +@(optimization_mode = "none") +fe_cond_assign :: #force_no_inline proc "contextless" ( + out1, arg1: ^Tight_Field_Element, + arg2: int, +) { + x1 := fiat.cmovznz_u64(fiat.u1(arg2), 
out1[0], arg1[0]) + x2 := fiat.cmovznz_u64(fiat.u1(arg2), out1[1], arg1[1]) + x3 := fiat.cmovznz_u64(fiat.u1(arg2), out1[2], arg1[2]) + x4 := fiat.cmovznz_u64(fiat.u1(arg2), out1[3], arg1[3]) + x5 := fiat.cmovznz_u64(fiat.u1(arg2), out1[4], arg1[4]) + x6 := fiat.cmovznz_u64(fiat.u1(arg2), out1[5], arg1[5]) + x7 := fiat.cmovznz_u64(fiat.u1(arg2), out1[6], arg1[6]) + x8 := fiat.cmovznz_u64(fiat.u1(arg2), out1[7], arg1[7]) + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} + +fe_to_bytes :: proc "contextless" (out1: ^[56]byte, arg1: ^Tight_Field_Element) { + x1, x2 := _subborrowx_u56(0x0, arg1[0], 0xffffffffffffff) + x3, x4 := _subborrowx_u56(x2, arg1[1], 0xffffffffffffff) + x5, x6 := _subborrowx_u56(x4, arg1[2], 0xffffffffffffff) + x7, x8 := _subborrowx_u56(x6, arg1[3], 0xffffffffffffff) + x9, x10 := _subborrowx_u56(x8, arg1[4], 0xfffffffffffffe) + x11, x12 := _subborrowx_u56(x10, arg1[5], 0xffffffffffffff) + x13, x14 := _subborrowx_u56(x12, arg1[6], 0xffffffffffffff) + x15, x16 := _subborrowx_u56(x14, arg1[7], 0xffffffffffffff) + x17 := fiat.cmovznz_u64(x16, u64(0x0), 0xffffffffffffffff) + x18, x19 := _addcarryx_u56(0x0, x1, (x17 & 0xffffffffffffff)) + x20, x21 := _addcarryx_u56(x19, x3, (x17 & 0xffffffffffffff)) + x22, x23 := _addcarryx_u56(x21, x5, (x17 & 0xffffffffffffff)) + x24, x25 := _addcarryx_u56(x23, x7, (x17 & 0xffffffffffffff)) + x26, x27 := _addcarryx_u56(x25, x9, (x17 & 0xfffffffffffffe)) + x28, x29 := _addcarryx_u56(x27, x11, (x17 & 0xffffffffffffff)) + x30, x31 := _addcarryx_u56(x29, x13, (x17 & 0xffffffffffffff)) + x32, _ := _addcarryx_u56(x31, x15, (x17 & 0xffffffffffffff)) + x34 := (u8(x18) & 0xff) + x35 := (x18 >> 8) + x36 := (u8(x35) & 0xff) + x37 := (x35 >> 8) + x38 := (u8(x37) & 0xff) + x39 := (x37 >> 8) + x40 := (u8(x39) & 0xff) + x41 := (x39 >> 8) + x42 := (u8(x41) & 0xff) + x43 := (x41 >> 8) + x44 := (u8(x43) & 0xff) + x45 := u8((x43 >> 8)) + x46 := (u8(x20) & 0xff) + 
x47 := (x20 >> 8) + x48 := (u8(x47) & 0xff) + x49 := (x47 >> 8) + x50 := (u8(x49) & 0xff) + x51 := (x49 >> 8) + x52 := (u8(x51) & 0xff) + x53 := (x51 >> 8) + x54 := (u8(x53) & 0xff) + x55 := (x53 >> 8) + x56 := (u8(x55) & 0xff) + x57 := u8((x55 >> 8)) + x58 := (u8(x22) & 0xff) + x59 := (x22 >> 8) + x60 := (u8(x59) & 0xff) + x61 := (x59 >> 8) + x62 := (u8(x61) & 0xff) + x63 := (x61 >> 8) + x64 := (u8(x63) & 0xff) + x65 := (x63 >> 8) + x66 := (u8(x65) & 0xff) + x67 := (x65 >> 8) + x68 := (u8(x67) & 0xff) + x69 := u8((x67 >> 8)) + x70 := (u8(x24) & 0xff) + x71 := (x24 >> 8) + x72 := (u8(x71) & 0xff) + x73 := (x71 >> 8) + x74 := (u8(x73) & 0xff) + x75 := (x73 >> 8) + x76 := (u8(x75) & 0xff) + x77 := (x75 >> 8) + x78 := (u8(x77) & 0xff) + x79 := (x77 >> 8) + x80 := (u8(x79) & 0xff) + x81 := u8((x79 >> 8)) + x82 := (u8(x26) & 0xff) + x83 := (x26 >> 8) + x84 := (u8(x83) & 0xff) + x85 := (x83 >> 8) + x86 := (u8(x85) & 0xff) + x87 := (x85 >> 8) + x88 := (u8(x87) & 0xff) + x89 := (x87 >> 8) + x90 := (u8(x89) & 0xff) + x91 := (x89 >> 8) + x92 := (u8(x91) & 0xff) + x93 := u8((x91 >> 8)) + x94 := (u8(x28) & 0xff) + x95 := (x28 >> 8) + x96 := (u8(x95) & 0xff) + x97 := (x95 >> 8) + x98 := (u8(x97) & 0xff) + x99 := (x97 >> 8) + x100 := (u8(x99) & 0xff) + x101 := (x99 >> 8) + x102 := (u8(x101) & 0xff) + x103 := (x101 >> 8) + x104 := (u8(x103) & 0xff) + x105 := u8((x103 >> 8)) + x106 := (u8(x30) & 0xff) + x107 := (x30 >> 8) + x108 := (u8(x107) & 0xff) + x109 := (x107 >> 8) + x110 := (u8(x109) & 0xff) + x111 := (x109 >> 8) + x112 := (u8(x111) & 0xff) + x113 := (x111 >> 8) + x114 := (u8(x113) & 0xff) + x115 := (x113 >> 8) + x116 := (u8(x115) & 0xff) + x117 := u8((x115 >> 8)) + x118 := (u8(x32) & 0xff) + x119 := (x32 >> 8) + x120 := (u8(x119) & 0xff) + x121 := (x119 >> 8) + x122 := (u8(x121) & 0xff) + x123 := (x121 >> 8) + x124 := (u8(x123) & 0xff) + x125 := (x123 >> 8) + x126 := (u8(x125) & 0xff) + x127 := (x125 >> 8) + x128 := (u8(x127) & 0xff) + x129 := u8((x127 >> 8)) + out1[0] = 
x34 + out1[1] = x36 + out1[2] = x38 + out1[3] = x40 + out1[4] = x42 + out1[5] = x44 + out1[6] = x45 + out1[7] = x46 + out1[8] = x48 + out1[9] = x50 + out1[10] = x52 + out1[11] = x54 + out1[12] = x56 + out1[13] = x57 + out1[14] = x58 + out1[15] = x60 + out1[16] = x62 + out1[17] = x64 + out1[18] = x66 + out1[19] = x68 + out1[20] = x69 + out1[21] = x70 + out1[22] = x72 + out1[23] = x74 + out1[24] = x76 + out1[25] = x78 + out1[26] = x80 + out1[27] = x81 + out1[28] = x82 + out1[29] = x84 + out1[30] = x86 + out1[31] = x88 + out1[32] = x90 + out1[33] = x92 + out1[34] = x93 + out1[35] = x94 + out1[36] = x96 + out1[37] = x98 + out1[38] = x100 + out1[39] = x102 + out1[40] = x104 + out1[41] = x105 + out1[42] = x106 + out1[43] = x108 + out1[44] = x110 + out1[45] = x112 + out1[46] = x114 + out1[47] = x116 + out1[48] = x117 + out1[49] = x118 + out1[50] = x120 + out1[51] = x122 + out1[52] = x124 + out1[53] = x126 + out1[54] = x128 + out1[55] = x129 +} + +fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[56]byte) { + x1 := (u64(arg1[55]) << 48) + x2 := (u64(arg1[54]) << 40) + x3 := (u64(arg1[53]) << 32) + x4 := (u64(arg1[52]) << 24) + x5 := (u64(arg1[51]) << 16) + x6 := (u64(arg1[50]) << 8) + x7 := arg1[49] + x8 := (u64(arg1[48]) << 48) + x9 := (u64(arg1[47]) << 40) + x10 := (u64(arg1[46]) << 32) + x11 := (u64(arg1[45]) << 24) + x12 := (u64(arg1[44]) << 16) + x13 := (u64(arg1[43]) << 8) + x14 := arg1[42] + x15 := (u64(arg1[41]) << 48) + x16 := (u64(arg1[40]) << 40) + x17 := (u64(arg1[39]) << 32) + x18 := (u64(arg1[38]) << 24) + x19 := (u64(arg1[37]) << 16) + x20 := (u64(arg1[36]) << 8) + x21 := arg1[35] + x22 := (u64(arg1[34]) << 48) + x23 := (u64(arg1[33]) << 40) + x24 := (u64(arg1[32]) << 32) + x25 := (u64(arg1[31]) << 24) + x26 := (u64(arg1[30]) << 16) + x27 := (u64(arg1[29]) << 8) + x28 := arg1[28] + x29 := (u64(arg1[27]) << 48) + x30 := (u64(arg1[26]) << 40) + x31 := (u64(arg1[25]) << 32) + x32 := (u64(arg1[24]) << 24) + x33 := (u64(arg1[23]) << 16) + 
x34 := (u64(arg1[22]) << 8) + x35 := arg1[21] + x36 := (u64(arg1[20]) << 48) + x37 := (u64(arg1[19]) << 40) + x38 := (u64(arg1[18]) << 32) + x39 := (u64(arg1[17]) << 24) + x40 := (u64(arg1[16]) << 16) + x41 := (u64(arg1[15]) << 8) + x42 := arg1[14] + x43 := (u64(arg1[13]) << 48) + x44 := (u64(arg1[12]) << 40) + x45 := (u64(arg1[11]) << 32) + x46 := (u64(arg1[10]) << 24) + x47 := (u64(arg1[9]) << 16) + x48 := (u64(arg1[8]) << 8) + x49 := arg1[7] + x50 := (u64(arg1[6]) << 48) + x51 := (u64(arg1[5]) << 40) + x52 := (u64(arg1[4]) << 32) + x53 := (u64(arg1[3]) << 24) + x54 := (u64(arg1[2]) << 16) + x55 := (u64(arg1[1]) << 8) + x56 := arg1[0] + x57 := (x55 + u64(x56)) + x58 := (x54 + x57) + x59 := (x53 + x58) + x60 := (x52 + x59) + x61 := (x51 + x60) + x62 := (x50 + x61) + x63 := (x48 + u64(x49)) + x64 := (x47 + x63) + x65 := (x46 + x64) + x66 := (x45 + x65) + x67 := (x44 + x66) + x68 := (x43 + x67) + x69 := (x41 + u64(x42)) + x70 := (x40 + x69) + x71 := (x39 + x70) + x72 := (x38 + x71) + x73 := (x37 + x72) + x74 := (x36 + x73) + x75 := (x34 + u64(x35)) + x76 := (x33 + x75) + x77 := (x32 + x76) + x78 := (x31 + x77) + x79 := (x30 + x78) + x80 := (x29 + x79) + x81 := (x27 + u64(x28)) + x82 := (x26 + x81) + x83 := (x25 + x82) + x84 := (x24 + x83) + x85 := (x23 + x84) + x86 := (x22 + x85) + x87 := (x20 + u64(x21)) + x88 := (x19 + x87) + x89 := (x18 + x88) + x90 := (x17 + x89) + x91 := (x16 + x90) + x92 := (x15 + x91) + x93 := (x13 + u64(x14)) + x94 := (x12 + x93) + x95 := (x11 + x94) + x96 := (x10 + x95) + x97 := (x9 + x96) + x98 := (x8 + x97) + x99 := (x6 + u64(x7)) + x100 := (x5 + x99) + x101 := (x4 + x100) + x102 := (x3 + x101) + x103 := (x2 + x102) + x104 := (x1 + x103) + out1[0] = x62 + out1[1] = x68 + out1[2] = x74 + out1[3] = x80 + out1[4] = x86 + out1[5] = x92 + out1[6] = x98 + out1[7] = x104 +} + +fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) { + x1 := arg1[0] + x2 := arg1[1] + x3 := arg1[2] + x4 := arg1[3] + x5 := arg1[4] + 
x6 := arg1[5] + x7 := arg1[6] + x8 := arg1[7] + out1[0] = x1 + out1[1] = x2 + out1[2] = x3 + out1[3] = x4 + out1[4] = x5 + out1[5] = x6 + out1[6] = x7 + out1[7] = x8 +} diff --git a/core/crypto/_fiat/field_poly1305/field.odin b/core/crypto/_fiat/field_poly1305/field.odin index b12046858..caaece98e 100644 --- a/core/crypto/_fiat/field_poly1305/field.odin +++ b/core/crypto/_fiat/field_poly1305/field.odin @@ -1,6 +1,5 @@ package field_poly1305 -import "base:intrinsics" import "core:encoding/endian" import "core:mem" @@ -29,9 +28,7 @@ fe_from_bytes :: #force_inline proc "contextless" ( // makes implementing the actual MAC block processing considerably // neater. - if len(arg1) != 16 { - intrinsics.trap() - } + ensure_contextless(len(arg1) == 16, "poly1305: invalid field element size") // While it may be unwise to do deserialization here on our // own when fiat-crypto provides equivalent functionality, diff --git a/core/crypto/_fiat/field_scalar25519/field.odin b/core/crypto/_fiat/field_scalar25519/field.odin index 9b40661b7..933637c54 100644 --- a/core/crypto/_fiat/field_scalar25519/field.odin +++ b/core/crypto/_fiat/field_scalar25519/field.odin @@ -1,18 +1,17 @@ package field_scalar25519 -import "base:intrinsics" import "core:encoding/endian" import "core:math/bits" import "core:mem" -@(private) +@(private, rodata) _TWO_168 := Montgomery_Domain_Field_Element { 0x5b8ab432eac74798, 0x38afddd6de59d5d7, 0xa2c131b399411b7c, 0x6329a7ed9ce5a30, } -@(private) +@(private, rodata) _TWO_336 := Montgomery_Domain_Field_Element { 0xbd3d108e2b35ecc5, 0x5c3a3718bdf9c90b, @@ -95,9 +94,8 @@ fe_from_bytes_wide :: proc "contextless" ( @(private) _fe_from_bytes_short :: proc "contextless" (out1: ^Montgomery_Domain_Field_Element, arg1: []byte) { // INVARIANT: len(arg1) < 32. 
- if len(arg1) >= 32 { - intrinsics.trap() - } + ensure_contextless(len(arg1) < 32, "edwards25519: oversized short scalar") + tmp: [32]byte copy(tmp[:], arg1) @@ -106,9 +104,7 @@ _fe_from_bytes_short :: proc "contextless" (out1: ^Montgomery_Domain_Field_Eleme } fe_to_bytes :: proc "contextless" (out1: []byte, arg1: ^Montgomery_Domain_Field_Element) { - if len(out1) != 32 { - intrinsics.trap() - } + ensure_contextless(len(out1) == 32, "edwards25519: invalid scalar output buffer size") tmp: Non_Montgomery_Domain_Field_Element fe_from_montgomery(&tmp, arg1) diff --git a/core/crypto/_sha3/sha3.odin b/core/crypto/_sha3/sha3.odin index 2db76fce0..52b3fbda9 100644 --- a/core/crypto/_sha3/sha3.odin +++ b/core/crypto/_sha3/sha3.odin @@ -44,7 +44,7 @@ Context :: struct { is_finalized: bool, // For SHAKE (unlimited squeeze is allowed) } -@(private) +@(private, rodata) keccakf_rndc := [?]u64 { 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, @@ -56,13 +56,13 @@ keccakf_rndc := [?]u64 { 0x8000000000008080, 0x0000000080000001, 0x8000000080008008, } -@(private) +@(private, rodata) keccakf_rotc := [?]int { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44, } -@(private) +@(private, rodata) keccakf_piln := [?]i32 { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1, @@ -122,7 +122,7 @@ keccakf :: proc "contextless" (st: ^[25]u64) { } } -init :: proc(ctx: ^Context) { +init :: proc "contextless" (ctx: ^Context) { for i := 0; i < 25; i += 1 { ctx.st.q[i] = 0 } @@ -133,9 +133,9 @@ init :: proc(ctx: ^Context) { ctx.is_finalized = false } -update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized) - assert(!ctx.is_finalized) +update :: proc "contextless" (ctx: ^Context, data: []byte) { + ensure_contextless(ctx.is_initialized) + ensure_contextless(!ctx.is_finalized) j := ctx.pt for i := 0; i < len(data); i += 1 { @@ -149,12 +149,9 
@@ update :: proc(ctx: ^Context, data: []byte) { ctx.pt = j } -final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) - - if len(hash) < ctx.mdlen { - panic("crypto/sha3: invalid destination digest size") - } +final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) { + ensure_contextless(ctx.is_initialized) + ensure_contextless(len(hash) >= ctx.mdlen, "crypto/sha3: invalid destination digest size") ctx := ctx if finalize_clone { @@ -173,11 +170,11 @@ final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { } } -clone :: proc(ctx, other: ^Context) { +clone :: proc "contextless" (ctx, other: ^Context) { ctx^ = other^ } -reset :: proc(ctx: ^Context) { +reset :: proc "contextless" (ctx: ^Context) { if !ctx.is_initialized { return } @@ -185,9 +182,9 @@ reset :: proc(ctx: ^Context) { mem.zero_explicit(ctx, size_of(ctx^)) } -shake_xof :: proc(ctx: ^Context) { - assert(ctx.is_initialized) - assert(!ctx.is_finalized) +shake_xof :: proc "contextless" (ctx: ^Context) { + ensure_contextless(ctx.is_initialized) + ensure_contextless(!ctx.is_finalized) ctx.st.b[ctx.pt] ~= ctx.dsbyte ctx.st.b[ctx.rsiz - 1] ~= 0x80 @@ -197,9 +194,9 @@ shake_xof :: proc(ctx: ^Context) { ctx.is_finalized = true // No more absorb, unlimited squeeze. 
} -shake_out :: proc(ctx: ^Context, hash: []byte) { - assert(ctx.is_initialized) - assert(ctx.is_finalized) +shake_out :: proc "contextless" (ctx: ^Context, hash: []byte) { + ensure_contextless(ctx.is_initialized) + ensure_contextless(ctx.is_finalized) j := ctx.pt for i := 0; i < len(hash); i += 1 { diff --git a/core/crypto/_sha3/sp800_185.odin b/core/crypto/_sha3/sp800_185.odin index a96f78cc1..8390d8490 100644 --- a/core/crypto/_sha3/sp800_185.odin +++ b/core/crypto/_sha3/sp800_185.odin @@ -3,7 +3,7 @@ package _sha3 import "core:encoding/endian" import "core:math/bits" -init_cshake :: proc(ctx: ^Context, n, s: []byte, sec_strength: int) { +init_cshake :: proc "contextless" (ctx: ^Context, n, s: []byte, sec_strength: int) { ctx.mdlen = sec_strength / 8 // No domain separator is equivalent to vanilla SHAKE. @@ -18,7 +18,7 @@ init_cshake :: proc(ctx: ^Context, n, s: []byte, sec_strength: int) { bytepad(ctx, [][]byte{n, s}, rate_cshake(sec_strength)) } -final_cshake :: proc(ctx: ^Context, dst: []byte, finalize_clone: bool = false) { +final_cshake :: proc "contextless" (ctx: ^Context, dst: []byte, finalize_clone: bool = false) { ctx := ctx if finalize_clone { tmp_ctx: Context @@ -32,7 +32,7 @@ final_cshake :: proc(ctx: ^Context, dst: []byte, finalize_clone: bool = false) { shake_out(ctx, dst) } -rate_cshake :: #force_inline proc(sec_strength: int) -> int { +rate_cshake :: #force_inline proc "contextless" (sec_strength: int) -> int { switch sec_strength { case 128: return RATE_128 @@ -40,7 +40,7 @@ rate_cshake :: #force_inline proc(sec_strength: int) -> int { return RATE_256 } - panic("crypto/sha3: invalid security strength") + panic_contextless("crypto/sha3: invalid security strength") } // right_encode and left_encode are defined to support 0 <= x < 2^2040 @@ -52,10 +52,10 @@ rate_cshake :: #force_inline proc(sec_strength: int) -> int { // // Thus we support 0 <= x < 2^128. -@(private) +@(private, rodata) _PAD: [RATE_128]byte // Biggest possible value of w per spec. 
-bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) { +bytepad :: proc "contextless" (ctx: ^Context, x_strings: [][]byte, w: int) { // 1. z = left_encode(w) || X. z_hi: u64 z_lo := left_right_encode(ctx, 0, u64(w), true) @@ -70,9 +70,7 @@ bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) { // This isn't actually possible, at least with the currently // defined SP 800-185 routines. - if carry != 0 { - panic("crypto/sha3: bytepad input length overflow") - } + ensure_contextless(carry == 0, "crypto/sha3: bytepad input length overflow") } // We skip this step as we are doing a byte-oriented implementation @@ -95,7 +93,7 @@ bytepad :: proc(ctx: ^Context, x_strings: [][]byte, w: int) { } } -encode_string :: #force_inline proc(ctx: ^Context, s: []byte) -> (u64, u64) { +encode_string :: #force_inline proc "contextless" (ctx: ^Context, s: []byte) -> (u64, u64) { l := encode_byte_len(ctx, len(s), true) // left_encode update(ctx, s) @@ -104,13 +102,13 @@ encode_string :: #force_inline proc(ctx: ^Context, s: []byte) -> (u64, u64) { return hi, lo } -encode_byte_len :: #force_inline proc(ctx: ^Context, l: int, is_left: bool) -> u64 { +encode_byte_len :: #force_inline proc "contextless" (ctx: ^Context, l: int, is_left: bool) -> u64 { hi, lo := bits.mul_u64(u64(l), 8) return left_right_encode(ctx, hi, lo, is_left) } @(private) -left_right_encode :: proc(ctx: ^Context, hi, lo: u64, is_left: bool) -> u64 { +left_right_encode :: proc "contextless" (ctx: ^Context, hi, lo: u64, is_left: bool) -> u64 { HI_OFFSET :: 1 LO_OFFSET :: HI_OFFSET + 8 RIGHT_OFFSET :: LO_OFFSET + 8 diff --git a/core/crypto/aead/aead.odin b/core/crypto/aead/aead.odin index 9b7d810e4..c8f324929 100644 --- a/core/crypto/aead/aead.odin +++ b/core/crypto/aead/aead.odin @@ -16,7 +16,7 @@ seal_oneshot :: proc(algo: Algorithm, dst, tag, key, iv, aad, plaintext: []byte, // returning true iff the authentication was successful. If authentication // fails, the destination buffer will be zeroed. 
// -// dst and plaintext MUST alias exactly or not at all. +// dst and ciphertext MUST alias exactly or not at all. @(require_results) open_oneshot :: proc(algo: Algorithm, dst, key, iv, aad, ciphertext, tag: []byte, impl: Implementation = nil) -> bool { ctx: Context diff --git a/core/crypto/aead/low_level.odin b/core/crypto/aead/low_level.odin index 38a0c84ba..c80574a0d 100644 --- a/core/crypto/aead/low_level.odin +++ b/core/crypto/aead/low_level.odin @@ -1,8 +1,10 @@ package aead +import "core:crypto/aegis" import "core:crypto/aes" import "core:crypto/chacha20" import "core:crypto/chacha20poly1305" +import "core:crypto/deoxysii" import "core:reflect" // Implementation is an AEAD implementation. Most callers will not need @@ -15,7 +17,7 @@ Implementation :: union { // MAX_TAG_SIZE is the maximum size tag that can be returned by any of the // Algorithms supported via this package. -MAX_TAG_SIZE :: 16 +MAX_TAG_SIZE :: 32 // Algorithm is the algorithm identifier associated with a given Context. Algorithm :: enum { @@ -25,9 +27,14 @@ Algorithm :: enum { AES_GCM_256, CHACHA20POLY1305, XCHACHA20POLY1305, + AEGIS_128L, + AEGIS_128L_256, // AEGIS-128L (256-bit tag) + AEGIS_256, + AEGIS_256_256, // AEGIS-256 (256-bit tag) + DEOXYS_II_256, } -// ALGORITM_NAMES is the Agorithm to algorithm name string. +// ALGORITM_NAMES is the Algorithm to algorithm name string. ALGORITHM_NAMES := [Algorithm]string { .Invalid = "Invalid", .AES_GCM_128 = "AES-GCM-128", @@ -35,6 +42,11 @@ ALGORITHM_NAMES := [Algorithm]string { .AES_GCM_256 = "AES-GCM-256", .CHACHA20POLY1305 = "chacha20poly1305", .XCHACHA20POLY1305 = "xchacha20poly1305", + .AEGIS_128L = "AEGIS-128L", + .AEGIS_128L_256 = "AEGIS-128L-256", + .AEGIS_256 = "AEGIS-256", + .AEGIS_256_256 = "AEGIS-256-256", + .DEOXYS_II_256 = "Deoxys-II-256", } // TAG_SIZES is the Algorithm to tag size in bytes. 
@@ -45,6 +57,11 @@ TAG_SIZES := [Algorithm]int { .AES_GCM_256 = aes.GCM_TAG_SIZE, .CHACHA20POLY1305 = chacha20poly1305.TAG_SIZE, .XCHACHA20POLY1305 = chacha20poly1305.TAG_SIZE, + .AEGIS_128L = aegis.TAG_SIZE_128, + .AEGIS_128L_256 = aegis.TAG_SIZE_256, + .AEGIS_256 = aegis.TAG_SIZE_128, + .AEGIS_256_256 = aegis.TAG_SIZE_256, + .DEOXYS_II_256 = deoxysii.TAG_SIZE, } // KEY_SIZES is the Algorithm to key size in bytes. @@ -55,6 +72,11 @@ KEY_SIZES := [Algorithm]int { .AES_GCM_256 = aes.KEY_SIZE_256, .CHACHA20POLY1305 = chacha20poly1305.KEY_SIZE, .XCHACHA20POLY1305 = chacha20poly1305.KEY_SIZE, + .AEGIS_128L = aegis.KEY_SIZE_128L, + .AEGIS_128L_256 = aegis.KEY_SIZE_128L, + .AEGIS_256 = aegis.KEY_SIZE_256, + .AEGIS_256_256 = aegis.KEY_SIZE_256, + .DEOXYS_II_256 = deoxysii.KEY_SIZE, } // IV_SIZES is the Algorithm to initialization vector size in bytes. @@ -67,6 +89,11 @@ IV_SIZES := [Algorithm]int { .AES_GCM_256 = aes.GCM_IV_SIZE, .CHACHA20POLY1305 = chacha20poly1305.IV_SIZE, .XCHACHA20POLY1305 = chacha20poly1305.XIV_SIZE, + .AEGIS_128L = aegis.IV_SIZE_128L, + .AEGIS_128L_256 = aegis.IV_SIZE_128L, + .AEGIS_256 = aegis.IV_SIZE_256, + .AEGIS_256_256 = aegis.IV_SIZE_256, + .DEOXYS_II_256 = deoxysii.IV_SIZE, } // Context is a concrete instantiation of a specific AEAD algorithm. @@ -75,6 +102,8 @@ Context :: struct { _impl: union { aes.Context_GCM, chacha20poly1305.Context, + aegis.Context, + deoxysii.Context, }, } @@ -86,6 +115,11 @@ _IMPL_IDS := [Algorithm]typeid { .AES_GCM_256 = typeid_of(aes.Context_GCM), .CHACHA20POLY1305 = typeid_of(chacha20poly1305.Context), .XCHACHA20POLY1305 = typeid_of(chacha20poly1305.Context), + .AEGIS_128L = typeid_of(aegis.Context), + .AEGIS_128L_256 = typeid_of(aegis.Context), + .AEGIS_256 = typeid_of(aegis.Context), + .AEGIS_256_256 = typeid_of(aegis.Context), + .DEOXYS_II_256 = typeid_of(deoxysii.Context), } // init initializes a Context with a specific AEAD Algorithm. 
@@ -94,9 +128,7 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat reset(ctx) } - if len(key) != KEY_SIZES[algorithm] { - panic("crypto/aead: invalid key size") - } + ensure(len(key) == KEY_SIZES[algorithm], "crypto/aead: invalid key size") // Directly specialize the union by setting the type ID (save a copy). reflect.set_union_variant_typeid( @@ -113,6 +145,12 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat case .XCHACHA20POLY1305: impl_ := impl != nil ? impl.(chacha20.Implementation) : chacha20.DEFAULT_IMPLEMENTATION chacha20poly1305.init_xchacha(&ctx._impl.(chacha20poly1305.Context), key, impl_) + case .AEGIS_128L, .AEGIS_128L_256, .AEGIS_256, .AEGIS_256_256: + impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION + aegis.init(&ctx._impl.(aegis.Context), key, impl_) + case .DEOXYS_II_256: + impl_ := impl != nil ? impl.(aes.Implementation) : aes.DEFAULT_IMPLEMENTATION + deoxysii.init(&ctx._impl.(deoxysii.Context), key, impl_) case .Invalid: panic("crypto/aead: uninitialized algorithm") case: @@ -127,11 +165,17 @@ init :: proc(ctx: ^Context, algorithm: Algorithm, key: []byte, impl: Implementat // // dst and plaintext MUST alias exactly or not at all. seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + ensure(len(tag) == TAG_SIZES[ctx._algo], "crypto/aead: invalid tag size") + switch &impl in ctx._impl { case aes.Context_GCM: aes.seal_gcm(&impl, dst, tag, iv, aad, plaintext) case chacha20poly1305.Context: chacha20poly1305.seal(&impl, dst, tag, iv, aad, plaintext) + case aegis.Context: + aegis.seal(&impl, dst, tag, iv, aad, plaintext) + case deoxysii.Context: + deoxysii.seal(&impl, dst, tag, iv, aad, plaintext) case: panic("crypto/aead: uninitialized algorithm") } @@ -145,11 +189,17 @@ seal_ctx :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // dst and plaintext MUST alias exactly or not at all. 
@(require_results) open_ctx :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + ensure(len(tag) == TAG_SIZES[ctx._algo], "crypto/aead: invalid tag size") + switch &impl in ctx._impl { case aes.Context_GCM: return aes.open_gcm(&impl, dst, iv, aad, ciphertext, tag) case chacha20poly1305.Context: return chacha20poly1305.open(&impl, dst, iv, aad, ciphertext, tag) + case aegis.Context: + return aegis.open(&impl, dst, iv, aad, ciphertext, tag) + case deoxysii.Context: + return deoxysii.open(&impl, dst, iv, aad, ciphertext, tag) case: panic("crypto/aead: uninitialized algorithm") } @@ -163,6 +213,10 @@ reset :: proc(ctx: ^Context) { aes.reset_gcm(&impl) case chacha20poly1305.Context: chacha20poly1305.reset(&impl) + case aegis.Context: + aegis.reset(&impl) + case deoxysii.Context: + deoxysii.reset(&impl) case: // Calling reset repeatedly is fine. } diff --git a/core/crypto/aegis/aegis.odin b/core/crypto/aegis/aegis.odin new file mode 100644 index 000000000..adecce91f --- /dev/null +++ b/core/crypto/aegis/aegis.odin @@ -0,0 +1,213 @@ +/* +package aegis implements the AEGIS-128L and AEGIS-256 Authenticated +Encryption with Additional Data algorithms. + +See: +- [[ https://www.ietf.org/archive/id/draft-irtf-cfrg-aegis-aead-12.txt ]] +*/ +package aegis + +import "core:bytes" +import "core:crypto" +import "core:crypto/aes" +import "core:mem" + +// KEY_SIZE_128L is the AEGIS-128L key size in bytes. +KEY_SIZE_128L :: 16 +// KEY_SIZE_256 is the AEGIS-256 key size in bytes. +KEY_SIZE_256 :: 32 +// IV_SIZE_128L is the AEGIS-128L IV size in bytes. +IV_SIZE_128L :: 16 +// IV_SIZE_256 is the AEGIS-256 IV size in bytes. +IV_SIZE_256 :: 32 +// TAG_SIZE_128 is the AEGIS-128L or AEGIS-256 128-bit tag size in bytes. +TAG_SIZE_128 :: 16 +// TAG_SIZE_256 is the AEGIS-128L or AEGIS-256 256-bit tag size in bytes. 
+TAG_SIZE_256 :: 32 + +@(private) +_RATE_128L :: 32 +@(private) +_RATE_256 :: 16 +@(private) +_RATE_MAX :: _RATE_128L + +@(private, rodata) +_C0 := [16]byte{ + 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, + 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62, +} + +@(private, rodata) +_C1 := [16]byte { + 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, + 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd, +} + +// Context is a keyed AEGIS-128L or AEGIS-256 instance. +Context :: struct { + _key: [KEY_SIZE_256]byte, + _key_len: int, + _impl: aes.Implementation, + _is_initialized: bool, +} + +@(private) +_validate_common_slice_sizes :: proc (ctx: ^Context, tag, iv, aad, text: []byte) { + switch len(tag) { + case TAG_SIZE_128, TAG_SIZE_256: + case: + panic("crypto/aegis: invalid tag size") + } + + iv_ok: bool + switch ctx._key_len { + case KEY_SIZE_128L: + iv_ok = len(iv) == IV_SIZE_128L + case KEY_SIZE_256: + iv_ok = len(iv) == IV_SIZE_256 + } + ensure(iv_ok,"crypto/aegis: invalid IV size") + + #assert(size_of(int) == 8 || size_of(int) <= 4) + // As A_MAX and P_MAX are both defined to be 2^61 - 1 bytes, and + // the maximum length of a slice is bound by `size_of(int)`, where + // `int` is register sized, there is no need to check AAD/text + // lengths. +} + +// init initializes a Context with the provided key, for AEGIS-128L or AEGIS-256. +init :: proc(ctx: ^Context, key: []byte, impl := aes.DEFAULT_IMPLEMENTATION) { + switch len(key) { + case KEY_SIZE_128L, KEY_SIZE_256: + case: + panic("crypto/aegis: invalid key size") + } + + copy(ctx._key[:], key) + ctx._key_len = len(key) + ctx._impl = impl + if ctx._impl == .Hardware && !is_hardware_accelerated() { + ctx._impl = .Portable + } + ctx._is_initialized = true +} + +// seal encrypts the plaintext and authenticates the aad and ciphertext, +// with the provided Context and iv, stores the output in dst and tag. +// +// dst and plaintext MUST alias exactly or not at all. 
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + ensure(ctx._is_initialized) + + _validate_common_slice_sizes(ctx, tag, iv, aad, plaintext) + ensure(len(dst) == len(plaintext), "crypto/aegis: invalid destination ciphertext size") + ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/aegis: dst and plaintext alias inexactly") + + switch ctx._impl { + case .Hardware: + st: State_HW + defer reset_state_hw(&st) + + init_hw(ctx, &st, iv) + + aad_len, pt_len := len(aad), len(plaintext) + if aad_len > 0 { + absorb_hw(&st, aad) + } + + if pt_len > 0 { + enc_hw(&st, dst, plaintext) + } + + finalize_hw(&st, tag, aad_len, pt_len) + case .Portable: + st: State_SW + defer reset_state_sw(&st) + + init_sw(ctx, &st, iv) + + aad_len, pt_len := len(aad), len(plaintext) + if aad_len > 0 { + absorb_sw(&st, aad) + } + + if pt_len > 0 { + enc_sw(&st, dst, plaintext) + } + + finalize_sw(&st, tag, aad_len, pt_len) + case: + panic("core/crypto/aegis: not implemented") + } +} + +// open authenticates the aad and ciphertext, and decrypts the ciphertext, +// with the provided Context, iv, and tag, and stores the output in dst, +// returning true iff the authentication was successful. If authentication +// fails, the destination buffer will be zeroed. +// +// dst and ciphertext MUST alias exactly or not at all. 
+@(require_results) +open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + ensure(ctx._is_initialized) + + _validate_common_slice_sizes(ctx, tag, iv, aad, ciphertext) + ensure(len(dst) == len(ciphertext), "crypto/aegis: invalid destination plaintext size") + ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/aegis: dst and ciphertext alias inexactly") + + tmp: [TAG_SIZE_256]byte + derived_tag := tmp[:len(tag)] + aad_len, ct_len := len(aad), len(ciphertext) + + switch ctx._impl { + case .Hardware: + st: State_HW + defer reset_state_hw(&st) + + init_hw(ctx, &st, iv) + + if aad_len > 0 { + absorb_hw(&st, aad) + } + + if ct_len > 0 { + dec_hw(&st, dst, ciphertext) + } + + finalize_hw(&st, derived_tag, aad_len, ct_len) + case .Portable: + st: State_SW + defer reset_state_sw(&st) + + init_sw(ctx, &st, iv) + + if aad_len > 0 { + absorb_sw(&st, aad) + } + + if ct_len > 0 { + dec_sw(&st, dst, ciphertext) + } + + finalize_sw(&st, derived_tag, aad_len, ct_len) + case: + panic("core/crypto/aegis: not implemented") + } + + if crypto.compare_constant_time(tag, derived_tag) != 1 { + mem.zero_explicit(raw_data(derived_tag), len(derived_tag)) + mem.zero_explicit(raw_data(dst), ct_len) + return false + } + + return true +} + +// reset sanitizes the Context. The Context must be +// re-initialized to be used again. +reset :: proc "contextless" (ctx: ^Context) { + mem.zero_explicit(&ctx._key, len(ctx._key)) + ctx._key_len = 0 + ctx._is_initialized = false +} diff --git a/core/crypto/aegis/aegis_impl_ct64.odin b/core/crypto/aegis/aegis_impl_ct64.odin new file mode 100644 index 000000000..4813b37ec --- /dev/null +++ b/core/crypto/aegis/aegis_impl_ct64.odin @@ -0,0 +1,452 @@ +package aegis + +import aes "core:crypto/_aes/ct64" +import "core:encoding/endian" +import "core:mem" + +// This uses the bitsliced 64-bit general purpose register SWAR AES +// round function. 
The intermediate state is stored in interleaved +// but NOT orthogonalized form, as leaving things in the orthogonalized +// format would overly complicate the update implementation. +// +// Note/perf: Per Frank Denis and a review of the specification, it is +// possible to gain slightly more performance by leaving the state in +// orthogonalized form while doing initialization, finalization, and +// absorbing AAD. This implementation opts out of those optimizations +// for the sake of simplicity. +// +// The update function leverages the parallelism (4 blocks at once). + +@(private) +State_SW :: struct { + s0_0, s0_1: u64, + s1_0, s1_1: u64, + s2_0, s2_1: u64, + s3_0, s3_1: u64, + s4_0, s4_1: u64, + s5_0, s5_1: u64, + s6_0, s6_1: u64, + s7_0, s7_1: u64, + q_k, q_b: [8]u64, + rate: int, +} + +@(private) +init_sw :: proc "contextless" (ctx: ^Context, st: ^State_SW, iv: []byte) { + switch ctx._key_len { + case KEY_SIZE_128L: + key_0, key_1 := aes.load_interleaved(ctx._key[:16]) + iv_0, iv_1 := aes.load_interleaved(iv) + + st.s0_0, st.s0_1 = aes.xor_interleaved(key_0, key_1, iv_0, iv_1) + st.s1_0, st.s1_1 = aes.load_interleaved(_C1[:]) + st.s2_0, st.s2_1 = aes.load_interleaved(_C0[:]) + st.s3_0, st.s3_1 = st.s1_0, st.s1_1 + st.s4_0, st.s4_1 = st.s0_0, st.s0_1 + st.s5_0, st.s5_1 = aes.xor_interleaved(key_0, key_1, st.s2_0, st.s2_1) + st.s6_0, st.s6_1 = aes.xor_interleaved(key_0, key_1, st.s1_0, st.s1_1) + st.s7_0, st.s7_1 = st.s5_0, st.s5_1 + st.rate = _RATE_128L + + for _ in 0 ..< 10 { + update_sw_128l(st, iv_0, iv_1, key_0, key_1) + } + case KEY_SIZE_256: + k0_0, k0_1 := aes.load_interleaved(ctx._key[:16]) + k1_0, k1_1 := aes.load_interleaved(ctx._key[16:]) + n0_0, n0_1 := aes.load_interleaved(iv[:16]) + n1_0, n1_1 := aes.load_interleaved(iv[16:]) + + st.s0_0, st.s0_1 = aes.xor_interleaved(k0_0, k0_1, n0_0, n0_1) + st.s1_0, st.s1_1 = aes.xor_interleaved(k1_0, k1_1, n1_0, n1_1) + st.s2_0, st.s2_1 = aes.load_interleaved(_C1[:]) + st.s3_0, st.s3_1 = 
aes.load_interleaved(_C0[:]) + st.s4_0, st.s4_1 = aes.xor_interleaved(k0_0, k0_1, st.s3_0, st.s3_1) + st.s5_0, st.s5_1 = aes.xor_interleaved(k1_0, k1_1, st.s2_0, st.s2_1) + st.rate = _RATE_256 + + u0_0, u0_1, u1_0, u1_1 := st.s0_0, st.s0_1, st.s1_0, st.s1_1 + for _ in 0 ..< 4 { + update_sw_256(st, k0_0, k0_1) + update_sw_256(st, k1_0, k1_1) + update_sw_256(st, u0_0, u0_1) + update_sw_256(st, u1_0, u1_1) + } + } +} + +@(private = "file") +update_sw_128l :: proc "contextless" (st: ^State_SW, m0_0, m0_1, m1_0, m1_1: u64) { + st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s0_0, st.s0_1, m0_0, m0_1) + st.q_k[1], st.q_k[5] = st.s1_0, st.s1_1 + st.q_k[2], st.q_k[6] = st.s2_0, st.s2_1 + st.q_k[3], st.q_k[7] = st.s3_0, st.s3_1 + aes.orthogonalize(&st.q_k) + + st.q_b[0], st.q_b[4] = st.s7_0, st.s7_1 + st.q_b[1], st.q_b[5] = st.s0_0, st.s0_1 + st.q_b[2], st.q_b[6] = st.s1_0, st.s1_1 + st.q_b[3], st.q_b[7] = st.s2_0, st.s2_1 + aes.orthogonalize(&st.q_b) + + aes.sub_bytes(&st.q_b) + aes.shift_rows(&st.q_b) + aes.mix_columns(&st.q_b) + aes.add_round_key(&st.q_b, st.q_k[:]) + aes.orthogonalize(&st.q_b) + + st.s0_0, st.s0_1 = st.q_b[0], st.q_b[4] + st.s1_0, st.s1_1 = st.q_b[1], st.q_b[5] + st.s2_0, st.s2_1 = st.q_b[2], st.q_b[6] + s3_0, s3_1 := st.q_b[3], st.q_b[7] + + st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s4_0, st.s4_1, m1_0, m1_1) + st.q_k[1], st.q_k[5] = st.s5_0, st.s5_1 + st.q_k[2], st.q_k[6] = st.s6_0, st.s6_1 + st.q_k[3], st.q_k[7] = st.s7_0, st.s7_1 + aes.orthogonalize(&st.q_k) + + st.q_b[0], st.q_b[4] = st.s3_0, st.s3_1 + st.q_b[1], st.q_b[5] = st.s4_0, st.s4_1 + st.q_b[2], st.q_b[6] = st.s5_0, st.s5_1 + st.q_b[3], st.q_b[7] = st.s6_0, st.s6_1 + aes.orthogonalize(&st.q_b) + + aes.sub_bytes(&st.q_b) + aes.shift_rows(&st.q_b) + aes.mix_columns(&st.q_b) + aes.add_round_key(&st.q_b, st.q_k[:]) + aes.orthogonalize(&st.q_b) + + st.s3_0, st.s3_1 = s3_0, s3_1 + st.s4_0, st.s4_1 = st.q_b[0], st.q_b[4] + st.s5_0, st.s5_1 = st.q_b[1], st.q_b[5] + st.s6_0, st.s6_1 = st.q_b[2], 
st.q_b[6] + st.s7_0, st.s7_1 = st.q_b[3], st.q_b[7] +} + +@(private = "file") +update_sw_256 :: proc "contextless" (st: ^State_SW, m_0, m_1: u64) { + st.q_k[0], st.q_k[4] = aes.xor_interleaved(st.s0_0, st.s0_1, m_0, m_1) + st.q_k[1], st.q_k[5] = st.s1_0, st.s1_1 + st.q_k[2], st.q_k[6] = st.s2_0, st.s2_1 + st.q_k[3], st.q_k[7] = st.s3_0, st.s3_1 + aes.orthogonalize(&st.q_k) + + st.q_b[0], st.q_b[4] = st.s5_0, st.s5_1 + st.q_b[1], st.q_b[5] = st.s0_0, st.s0_1 + st.q_b[2], st.q_b[6] = st.s1_0, st.s1_1 + st.q_b[3], st.q_b[7] = st.s2_0, st.s2_1 + aes.orthogonalize(&st.q_b) + + aes.sub_bytes(&st.q_b) + aes.shift_rows(&st.q_b) + aes.mix_columns(&st.q_b) + aes.add_round_key(&st.q_b, st.q_k[:]) + aes.orthogonalize(&st.q_b) + + st.s0_0, st.s0_1 = st.q_b[0], st.q_b[4] + st.s1_0, st.s1_1 = st.q_b[1], st.q_b[5] + st.s2_0, st.s2_1 = st.q_b[2], st.q_b[6] + s3_0, s3_1 := st.q_b[3], st.q_b[7] + + st.q_k[0], st.q_k[4] = st.s4_0, st.s4_1 + st.q_k[1], st.q_k[5] = st.s5_0, st.s5_1 + aes.orthogonalize(&st.q_k) + + st.q_b[0], st.q_b[4] = st.s3_0, st.s3_1 + st.q_b[1], st.q_b[5] = st.s4_0, st.s4_1 + aes.orthogonalize(&st.q_b) + + aes.sub_bytes(&st.q_b) + aes.shift_rows(&st.q_b) + aes.mix_columns(&st.q_b) + aes.add_round_key(&st.q_b, st.q_k[:]) + aes.orthogonalize(&st.q_b) + + st.s3_0, st.s3_1 = s3_0, s3_1 + st.s4_0, st.s4_1 = st.q_b[0], st.q_b[4] + st.s5_0, st.s5_1 = st.q_b[1], st.q_b[5] +} + +@(private = "file") +absorb_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, ai: []byte) #no_bounds_check { + t0_0, t0_1 := aes.load_interleaved(ai[:16]) + t1_0, t1_1 := aes.load_interleaved(ai[16:]) + update_sw_128l(st, t0_0, t0_1, t1_0, t1_1) +} + +@(private = "file") +absorb_sw_256 :: #force_inline proc "contextless" (st: ^State_SW, ai: []byte) { + m_0, m_1 := aes.load_interleaved(ai) + update_sw_256(st, m_0, m_1) +} + +@(private) +absorb_sw :: proc "contextless" (st: ^State_SW, aad: []byte) #no_bounds_check { + ai, l := aad, len(aad) + + switch st.rate { + case _RATE_128L: + for l >= 
_RATE_128L { + absorb_sw_128l(st, ai) + ai = ai[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + absorb_sw_256(st, ai) + + ai = ai[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // AAD is not confidential. + copy(tmp[:], ai) + switch st.rate { + case _RATE_128L: + absorb_sw_128l(st, tmp[:]) + case _RATE_256: + absorb_sw_256(st, tmp[:]) + } + } +} + +@(private = "file", require_results) +z_sw_128l :: proc "contextless" (st: ^State_SW) -> (u64, u64, u64, u64) { + z0_0, z0_1 := aes.and_interleaved(st.s2_0, st.s2_1, st.s3_0, st.s3_1) + z0_0, z0_1 = aes.xor_interleaved(st.s1_0, st.s1_1, z0_0, z0_1) + z0_0, z0_1 = aes.xor_interleaved(st.s6_0, st.s6_1, z0_0, z0_1) + + z1_0, z1_1 := aes.and_interleaved(st.s6_0, st.s6_1, st.s7_0, st.s7_1) + z1_0, z1_1 = aes.xor_interleaved(st.s5_0, st.s5_1, z1_0, z1_1) + z1_0, z1_1 = aes.xor_interleaved(st.s2_0, st.s2_1, z1_0, z1_1) + + return z0_0, z0_1, z1_0, z1_1 +} + +@(private = "file", require_results) +z_sw_256 :: proc "contextless" (st: ^State_SW) -> (u64, u64) { + z_0, z_1 := aes.and_interleaved(st.s2_0, st.s2_1, st.s3_0, st.s3_1) + z_0, z_1 = aes.xor_interleaved(st.s5_0, st.s5_1, z_0, z_1) + z_0, z_1 = aes.xor_interleaved(st.s4_0, st.s4_1, z_0, z_1) + return aes.xor_interleaved(st.s1_0, st.s1_1, z_0, z_1) +} + +@(private = "file") +enc_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, ci, xi: []byte) #no_bounds_check { + z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st) + + t0_0, t0_1 := aes.load_interleaved(xi[:16]) + t1_0, t1_1 := aes.load_interleaved(xi[16:]) + update_sw_128l(st, t0_0, t0_1, t1_0, t1_1) + + out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1) + out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1) + aes.store_interleaved(ci[:16], out0_0, out0_1) + aes.store_interleaved(ci[16:], out1_0, out1_1) +} + +@(private = "file") +enc_sw_256 :: #force_inline proc "contextless" (st: 
^State_SW, ci, xi: []byte) #no_bounds_check { + z_0, z_1 := z_sw_256(st) + + xi_0, xi_1 := aes.load_interleaved(xi) + update_sw_256(st, xi_0, xi_1) + + ci_0, ci_1 := aes.xor_interleaved(xi_0, xi_1, z_0, z_1) + aes.store_interleaved(ci, ci_0, ci_1) +} + +@(private) +enc_sw :: proc "contextless" (st: ^State_SW, dst, src: []byte) #no_bounds_check { + ci, xi, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + enc_sw_128l(st, ci, xi) + ci = ci[_RATE_128L:] + xi = xi[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + enc_sw_256(st, ci, xi) + ci = ci[_RATE_256:] + xi = xi[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // Ciphertext is not confidential. + copy(tmp[:], xi) + switch st.rate { + case _RATE_128L: + enc_sw_128l(st, tmp[:], tmp[:]) + case _RATE_256: + enc_sw_256(st, tmp[:], tmp[:]) + } + copy(ci, tmp[:l]) + } +} + +@(private = "file") +dec_sw_128l :: #force_inline proc "contextless" (st: ^State_SW, xi, ci: []byte) #no_bounds_check { + z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st) + + t0_0, t0_1 := aes.load_interleaved(ci[:16]) + t1_0, t1_1 := aes.load_interleaved(ci[16:]) + out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1) + out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1) + + update_sw_128l(st, out0_0, out0_1, out1_0, out1_1) + aes.store_interleaved(xi[:16], out0_0, out0_1) + aes.store_interleaved(xi[16:], out1_0, out1_1) +} + +@(private = "file") +dec_sw_256 :: #force_inline proc "contextless" (st: ^State_SW, xi, ci: []byte) #no_bounds_check { + z_0, z_1 := z_sw_256(st) + + ci_0, ci_1 := aes.load_interleaved(ci) + xi_0, xi_1 := aes.xor_interleaved(ci_0, ci_1, z_0, z_1) + + update_sw_256(st, xi_0, xi_1) + aes.store_interleaved(xi, xi_0, xi_1) +} + +@(private = "file") +dec_partial_sw_128l :: proc "contextless" (st: ^State_SW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_128L]byte + defer 
mem.zero_explicit(&tmp, size_of(tmp)) + + z0_0, z0_1, z1_0, z1_1 := z_sw_128l(st) + copy(tmp[:], cn) + + t0_0, t0_1 := aes.load_interleaved(tmp[:16]) + t1_0, t1_1 := aes.load_interleaved(tmp[16:]) + out0_0, out0_1 := aes.xor_interleaved(t0_0, t0_1, z0_0, z0_1) + out1_0, out1_1 := aes.xor_interleaved(t1_0, t1_1, z1_0, z1_1) + + aes.store_interleaved(tmp[:16], out0_0, out0_1) + aes.store_interleaved(tmp[16:], out1_0, out1_1) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_128L; off += 1 { + tmp[off] = 0 + } + out0_0, out0_1 = aes.load_interleaved(tmp[:16]) + out1_0, out1_1 = aes.load_interleaved(tmp[16:]) + update_sw_128l(st, out0_0, out0_1, out1_0, out1_1) +} + +@(private = "file") +dec_partial_sw_256 :: proc "contextless" (st: ^State_SW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_256]byte + defer mem.zero_explicit(&tmp, size_of(tmp)) + + z_0, z_1 := z_sw_256(st) + copy(tmp[:], cn) + + cn_0, cn_1 := aes.load_interleaved(tmp[:]) + xn_0, xn_1 := aes.xor_interleaved(cn_0, cn_1, z_0, z_1) + + aes.store_interleaved(tmp[:], xn_0, xn_1) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_256; off += 1 { + tmp[off] = 0 + } + xn_0, xn_1 = aes.load_interleaved(tmp[:]) + update_sw_256(st, xn_0, xn_1) +} + +@(private) +dec_sw :: proc "contextless" (st: ^State_SW, dst, src: []byte) #no_bounds_check { + xi, ci, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + dec_sw_128l(st, xi, ci) + xi = xi[_RATE_128L:] + ci = ci[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + dec_sw_256(st, xi, ci) + xi = xi[_RATE_256:] + ci = ci[_RATE_256:] + l -= _RATE_256 + } + } + + // Process the remainder. 
+ if l > 0 { + switch st.rate { + case _RATE_128L: + dec_partial_sw_128l(st, xi, ci) + case _RATE_256: + dec_partial_sw_256(st, xi, ci) + } + } +} + +@(private) +finalize_sw :: proc "contextless" (st: ^State_SW, tag: []byte, ad_len, msg_len: int) { + tmp: [16]byte + endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) + endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) + + t_0, t_1 := aes.load_interleaved(tmp[:]) + + t0_0, t0_1, t1_0, t1_1: u64 = ---, ---, ---, --- + switch st.rate { + case _RATE_128L: + t_0, t_1 = aes.xor_interleaved(st.s2_0, st.s2_1, t_0, t_1) + for _ in 0 ..< 7 { + update_sw_128l(st, t_0, t_1, t_0, t_1) + } + + t0_0, t0_1 = aes.xor_interleaved(st.s0_0, st.s0_1, st.s1_0, st.s1_1) + t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s2_0, st.s2_1) + t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s3_0, st.s3_1) + + t1_0, t1_1 = aes.xor_interleaved(st.s4_0, st.s4_1, st.s5_0, st.s5_1) + t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s6_0, st.s6_1) + if len(tag) == TAG_SIZE_256 { + t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s7_0, st.s7_1) + } + case _RATE_256: + t_0, t_1 = aes.xor_interleaved(st.s3_0, st.s3_1, t_0, t_1) + for _ in 0 ..< 7 { + update_sw_256(st, t_0, t_1) + } + + t0_0, t0_1 = aes.xor_interleaved(st.s0_0, st.s0_1, st.s1_0, st.s1_1) + t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, st.s2_0, st.s2_1) + + t1_0, t1_1 = aes.xor_interleaved(st.s3_0, st.s3_1, st.s4_0, st.s4_1) + t1_0, t1_1 = aes.xor_interleaved(t1_0, t1_1, st.s5_0, st.s5_1) + } + switch len(tag) { + case TAG_SIZE_128: + t0_0, t0_1 = aes.xor_interleaved(t0_0, t0_1, t1_0, t1_1) + aes.store_interleaved(tag, t0_0, t0_1) + case TAG_SIZE_256: + aes.store_interleaved(tag[:16], t0_0, t0_1) + aes.store_interleaved(tag[16:], t1_0, t1_1) + } +} + +@(private) +reset_state_sw :: proc "contextless" (st: ^State_SW) { + mem.zero_explicit(st, size_of(st^)) +} diff --git a/core/crypto/aegis/aegis_impl_hw_gen.odin b/core/crypto/aegis/aegis_impl_hw_gen.odin new file mode 100644 index 
000000000..5ec2f3d6e --- /dev/null +++ b/core/crypto/aegis/aegis_impl_hw_gen.odin @@ -0,0 +1,44 @@ +#+build !amd64 +package aegis + +@(private = "file") +ERR_HW_NOT_SUPPORTED :: "crypto/aegis: hardware implementation unsupported" + +@(private) +State_HW :: struct {} + +// is_hardware_accelerated returns true iff hardware accelerated AEGIS +// is supported. +is_hardware_accelerated :: proc "contextless" () -> bool { + return false +} + +@(private) +init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private) +absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private) +enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private) +dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private) +finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private) +reset_state_hw :: proc "contextless" (st: ^State_HW) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} diff --git a/core/crypto/aegis/aegis_impl_hw_intel.odin b/core/crypto/aegis/aegis_impl_hw_intel.odin new file mode 100644 index 000000000..5334f3258 --- /dev/null +++ b/core/crypto/aegis/aegis_impl_hw_intel.odin @@ -0,0 +1,389 @@ +#+build amd64 +package aegis + +import "base:intrinsics" +import "core:crypto/aes" +import "core:encoding/endian" +import "core:mem" +import "core:simd/x86" + +@(private) +State_HW :: struct { + s0: x86.__m128i, + s1: x86.__m128i, + s2: x86.__m128i, + s3: x86.__m128i, + s4: x86.__m128i, + s5: x86.__m128i, + s6: x86.__m128i, + s7: x86.__m128i, + rate: int, +} + +// is_hardware_accelerated returns true iff hardware accelerated AEGIS +// is supported. 
+is_hardware_accelerated :: proc "contextless" () -> bool { + return aes.is_hardware_accelerated() +} + +@(private, enable_target_feature = "sse2,aes") +init_hw :: proc "contextless" (ctx: ^Context, st: ^State_HW, iv: []byte) { + switch ctx._key_len { + case KEY_SIZE_128L: + key := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) + iv := intrinsics.unaligned_load((^x86.__m128i)(raw_data(iv))) + + st.s0 = x86._mm_xor_si128(key, iv) + st.s1 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) + st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) + st.s3 = st.s1 + st.s4 = st.s0 + st.s5 = x86._mm_xor_si128(key, st.s2) // key ^ C0 + st.s6 = x86._mm_xor_si128(key, st.s1) // key ^ C1 + st.s7 = st.s5 + st.rate = _RATE_128L + + for _ in 0 ..< 10 { + update_hw_128l(st, iv, key) + } + case KEY_SIZE_256: + k0 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[0])) + k1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx._key[16])) + n0 := intrinsics.unaligned_load((^x86.__m128i)(&iv[0])) + n1 := intrinsics.unaligned_load((^x86.__m128i)(&iv[16])) + + st.s0 = x86._mm_xor_si128(k0, n0) + st.s1 = x86._mm_xor_si128(k1, n1) + st.s2 = intrinsics.unaligned_load((^x86.__m128i)(&_C1[0])) + st.s3 = intrinsics.unaligned_load((^x86.__m128i)(&_C0[0])) + st.s4 = x86._mm_xor_si128(k0, st.s3) // k0 ^ C0 + st.s5 = x86._mm_xor_si128(k1, st.s2) // k1 ^ C1 + st.rate = _RATE_256 + + u0, u1 := st.s0, st.s1 + for _ in 0 ..< 4 { + update_hw_256(st, k0) + update_hw_256(st, k1) + update_hw_256(st, u0) + update_hw_256(st, u1) + } + } +} + +@(private = "file", enable_target_feature = "sse2,aes") +update_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, m0, m1: x86.__m128i) { + s0_ := x86._mm_aesenc_si128(st.s7, x86._mm_xor_si128(st.s0, m0)) + s1_ := x86._mm_aesenc_si128(st.s0, st.s1) + s2_ := x86._mm_aesenc_si128(st.s1, st.s2) + s3_ := x86._mm_aesenc_si128(st.s2, st.s3) + s4_ := x86._mm_aesenc_si128(st.s3, x86._mm_xor_si128(st.s4, m1)) + s5_ := x86._mm_aesenc_si128(st.s4, st.s5) + 
s6_ := x86._mm_aesenc_si128(st.s5, st.s6) + s7_ := x86._mm_aesenc_si128(st.s6, st.s7) + st.s0, st.s1, st.s2, st.s3, st.s4, st.s5, st.s6, st.s7 = s0_, s1_, s2_, s3_, s4_, s5_, s6_, s7_ +} + +@(private = "file", enable_target_feature = "sse2,aes") +update_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, m: x86.__m128i) { + s0_ := x86._mm_aesenc_si128(st.s5, x86._mm_xor_si128(st.s0, m)) + s1_ := x86._mm_aesenc_si128(st.s0, st.s1) + s2_ := x86._mm_aesenc_si128(st.s1, st.s2) + s3_ := x86._mm_aesenc_si128(st.s2, st.s3) + s4_ := x86._mm_aesenc_si128(st.s3, st.s4) + s5_ := x86._mm_aesenc_si128(st.s4, st.s5) + st.s0, st.s1, st.s2, st.s3, st.s4, st.s5 = s0_, s1_, s2_, s3_, s4_, s5_ +} + +@(private = "file", enable_target_feature = "sse2,aes") +absorb_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + t0 := intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) + t1 := intrinsics.unaligned_load((^x86.__m128i)(&ai[16])) + update_hw_128l(st, t0, t1) +} + +@(private = "file", enable_target_feature = "sse2,aes") +absorb_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ai: []byte) { + m := intrinsics.unaligned_load((^x86.__m128i)(&ai[0])) + update_hw_256(st, m) +} + +@(private, enable_target_feature = "sse2,aes") +absorb_hw :: proc "contextless" (st: ^State_HW, aad: []byte) #no_bounds_check { + ai, l := aad, len(aad) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + absorb_hw_128l(st, ai) + ai = ai[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + absorb_hw_256(st, ai) + + ai = ai[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // AAD is not confidential. 
+ copy(tmp[:], ai) + switch st.rate { + case _RATE_128L: + absorb_hw_128l(st, tmp[:]) + case _RATE_256: + absorb_hw_256(st, tmp[:]) + } + } +} + +@(private = "file", enable_target_feature = "sse2", require_results) +z_hw_128l :: #force_inline proc "contextless" (st: ^State_HW) -> (x86.__m128i, x86.__m128i) { + z0 := x86._mm_xor_si128( + st.s6, + x86._mm_xor_si128( + st.s1, + x86._mm_and_si128(st.s2, st.s3), + ), + ) + z1 := x86._mm_xor_si128( + st.s2, + x86._mm_xor_si128( + st.s5, + x86._mm_and_si128(st.s6, st.s7), + ), + ) + return z0, z1 +} + +@(private = "file", enable_target_feature = "sse2", require_results) +z_hw_256 :: #force_inline proc "contextless" (st: ^State_HW) -> x86.__m128i { + return x86._mm_xor_si128( + st.s1, + x86._mm_xor_si128( + st.s4, + x86._mm_xor_si128( + st.s5, + x86._mm_and_si128(st.s2, st.s3), + ), + ), + ) +} + +@(private = "file", enable_target_feature = "sse2,aes") +enc_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^x86.__m128i)(&xi[0])) + t1 := intrinsics.unaligned_load((^x86.__m128i)(&xi[16])) + update_hw_128l(st, t0, t1) + + out0 := x86._mm_xor_si128(t0, z0) + out1 := x86._mm_xor_si128(t1, z1) + intrinsics.unaligned_store((^x86.__m128i)(&ci[0]), out0) + intrinsics.unaligned_store((^x86.__m128i)(&ci[16]), out1) +} + +@(private = "file", enable_target_feature = "sse2,aes") +enc_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, ci, xi: []byte) #no_bounds_check { + z := z_hw_256(st) + + xi_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(xi))) + update_hw_256(st, xi_) + + ci_ := x86._mm_xor_si128(xi_, z) + intrinsics.unaligned_store((^x86.__m128i)(raw_data(ci)), ci_) +} + +@(private, enable_target_feature = "sse2,aes") +enc_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + ci, xi, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + 
enc_hw_128l(st, ci, xi) + ci = ci[_RATE_128L:] + xi = xi[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + enc_hw_256(st, ci, xi) + ci = ci[_RATE_256:] + xi = xi[_RATE_256:] + l -= _RATE_256 + } + } + + // Pad out the remainder with `0`s till it is rate sized. + if l > 0 { + tmp: [_RATE_MAX]byte // Ciphertext is not confidential. + copy(tmp[:], xi) + switch st.rate { + case _RATE_128L: + enc_hw_128l(st, tmp[:], tmp[:]) + case _RATE_256: + enc_hw_256(st, tmp[:], tmp[:]) + } + copy(ci, tmp[:l]) + } +} + +@(private = "file", enable_target_feature = "sse2,aes") +dec_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z0, z1 := z_hw_128l(st) + + t0 := intrinsics.unaligned_load((^x86.__m128i)(&ci[0])) + t1 := intrinsics.unaligned_load((^x86.__m128i)(&ci[16])) + out0 := x86._mm_xor_si128(t0, z0) + out1 := x86._mm_xor_si128(t1, z1) + + update_hw_128l(st, out0, out1) + intrinsics.unaligned_store((^x86.__m128i)(&xi[0]), out0) + intrinsics.unaligned_store((^x86.__m128i)(&xi[16]), out1) +} + +@(private = "file", enable_target_feature = "sse2,aes") +dec_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xi, ci: []byte) #no_bounds_check { + z := z_hw_256(st) + + ci_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(ci))) + xi_ := x86._mm_xor_si128(ci_, z) + + update_hw_256(st, xi_) + intrinsics.unaligned_store((^x86.__m128i)(raw_data(xi)), xi_) +} + +@(private = "file", enable_target_feature = "sse2,aes") +dec_partial_hw_128l :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_128L]byte + defer mem.zero_explicit(&tmp, size_of(tmp)) + + z0, z1 := z_hw_128l(st) + copy(tmp[:], cn) + + t0 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) + t1 := intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) + out0 := x86._mm_xor_si128(t0, z0) + out1 := x86._mm_xor_si128(t1, z1) + + intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), out0) + 
intrinsics.unaligned_store((^x86.__m128i)(&tmp[16]), out1) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_128L; off += 1 { + tmp[off] = 0 + } + out0 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) // v0 + out1 = intrinsics.unaligned_load((^x86.__m128i)(&tmp[16])) // v1 + update_hw_128l(st, out0, out1) +} + +@(private = "file", enable_target_feature = "sse2,aes") +dec_partial_hw_256 :: #force_inline proc "contextless" (st: ^State_HW, xn, cn: []byte) #no_bounds_check { + tmp: [_RATE_256]byte + defer mem.zero_explicit(&tmp, size_of(tmp)) + + z := z_hw_256(st) + copy(tmp[:], cn) + + cn_ := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) + xn_ := x86._mm_xor_si128(cn_, z) + + intrinsics.unaligned_store((^x86.__m128i)(&tmp[0]), xn_) + copy(xn, tmp[:]) + + for off := len(xn); off < _RATE_256; off += 1 { + tmp[off] = 0 + } + xn_ = intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) + update_hw_256(st, xn_) +} + +@(private, enable_target_feature = "sse2,aes") +dec_hw :: proc "contextless" (st: ^State_HW, dst, src: []byte) #no_bounds_check { + xi, ci, l := dst, src, len(src) + + switch st.rate { + case _RATE_128L: + for l >= _RATE_128L { + dec_hw_128l(st, xi, ci) + xi = xi[_RATE_128L:] + ci = ci[_RATE_128L:] + l -= _RATE_128L + } + case _RATE_256: + for l >= _RATE_256 { + dec_hw_256(st, xi, ci) + xi = xi[_RATE_256:] + ci = ci[_RATE_256:] + l -= _RATE_256 + } + } + + // Process the remainder. 
+ if l > 0 { + switch st.rate { + case _RATE_128L: + dec_partial_hw_128l(st, xi, ci) + case _RATE_256: + dec_partial_hw_256(st, xi, ci) + } + } +} + +@(private, enable_target_feature = "sse2,aes") +finalize_hw :: proc "contextless" (st: ^State_HW, tag: []byte, ad_len, msg_len: int) { + tmp: [16]byte + endian.unchecked_put_u64le(tmp[0:], u64(ad_len) * 8) + endian.unchecked_put_u64le(tmp[8:], u64(msg_len) * 8) + + t := intrinsics.unaligned_load((^x86.__m128i)(&tmp[0])) + + t0, t1: x86.__m128i = ---, --- + switch st.rate { + case _RATE_128L: + t = x86._mm_xor_si128(st.s2, t) + for _ in 0 ..< 7 { + update_hw_128l(st, t, t) + } + + t0 = x86._mm_xor_si128(st.s0, st.s1) + t0 = x86._mm_xor_si128(t0, st.s2) + t0 = x86._mm_xor_si128(t0, st.s3) + + t1 = x86._mm_xor_si128(st.s4, st.s5) + t1 = x86._mm_xor_si128(t1, st.s6) + if len(tag) == TAG_SIZE_256 { + t1 = x86._mm_xor_si128(t1, st.s7) + } + case _RATE_256: + t = x86._mm_xor_si128(st.s3, t) + for _ in 0 ..< 7 { + update_hw_256(st, t) + } + + t0 = x86._mm_xor_si128(st.s0, st.s1) + t0 = x86._mm_xor_si128(t0, st.s2) + + t1 = x86._mm_xor_si128(st.s3, st.s4) + t1 = x86._mm_xor_si128(t1, st.s5) + } + switch len(tag) { + case TAG_SIZE_128: + t0 = x86._mm_xor_si128(t0, t1) + intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) + case TAG_SIZE_256: + intrinsics.unaligned_store((^x86.__m128i)(&tag[0]), t0) + intrinsics.unaligned_store((^x86.__m128i)(&tag[16]), t1) + } +} + +@(private) +reset_state_hw :: proc "contextless" (st: ^State_HW) { + mem.zero_explicit(st, size_of(st^)) +} diff --git a/core/crypto/aes/aes_ctr.odin b/core/crypto/aes/aes_ctr.odin index 20b75e57f..a74133235 100644 --- a/core/crypto/aes/aes_ctr.odin +++ b/core/crypto/aes/aes_ctr.odin @@ -21,9 +21,7 @@ Context_CTR :: struct { // init_ctr initializes a Context_CTR with the provided key and IV. 
init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) { - if len(iv) != CTR_IV_SIZE { - panic("crypto/aes: invalid CTR IV size") - } + ensure(len(iv) == CTR_IV_SIZE, "crypto/aes: invalid CTR IV size") init_impl(&ctx._impl, key, impl) ctx._off = BLOCK_SIZE @@ -36,16 +34,14 @@ init_ctr :: proc(ctx: ^Context_CTR, key, iv: []byte, impl := DEFAULT_IMPLEMENTAT // keystream, and writes the resulting output to dst. dst and src MUST // alias exactly or not at all. xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { src = src[:dst_len] } - if bytes.alias_inexactly(dst, src) { - panic("crypto/aes: dst and src alias inexactly") - } + ensure(!bytes.alias_inexactly(dst, src), "crypto/aes: dst and src alias inexactly") #no_bounds_check for remaining := len(src); remaining > 0; { // Process multiple blocks at once @@ -82,7 +78,7 @@ xor_bytes_ctr :: proc(ctx: ^Context_CTR, dst, src: []byte) { // keystream_bytes_ctr fills dst with the raw AES-CTR keystream output. keystream_bytes_ctr :: proc(ctx: ^Context_CTR, dst: []byte) { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) dst := dst #no_bounds_check for remaining := len(dst); remaining > 0; { diff --git a/core/crypto/aes/aes_ecb.odin b/core/crypto/aes/aes_ecb.odin index 32476006c..cac62de5d 100644 --- a/core/crypto/aes/aes_ecb.odin +++ b/core/crypto/aes/aes_ecb.odin @@ -19,11 +19,9 @@ init_ecb :: proc(ctx: ^Context_ECB, key: []byte, impl := DEFAULT_IMPLEMENTATION) // encrypt_ecb encrypts the BLOCK_SIZE buffer src, and writes the result to dst. 
encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
-	if len(dst) != BLOCK_SIZE || len(src) != BLOCK_SIZE {
-		panic("crypto/aes: invalid buffer size(s)")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")
 
 	switch &impl in ctx._impl {
 	case ct64.Context:
@@ -35,11 +33,9 @@ encrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
 
 // decrypt_ecb decrypts the BLOCK_SIZE buffer src, and writes the result to dst.
 decrypt_ecb :: proc(ctx: ^Context_ECB, dst, src: []byte) {
-	assert(ctx._is_initialized)
-
-	if len(dst) != BLOCK_SIZE || len(src) != BLOCK_SIZE {
-		panic("crypto/aes: invalid buffer size(s)")
-	}
+	ensure(ctx._is_initialized)
+	ensure(len(dst) == BLOCK_SIZE, "crypto/aes: invalid dst size")
+	ensure(len(src) == BLOCK_SIZE, "crypto/aes: invalid src size")
 
 	switch &impl in ctx._impl {
 	case ct64.Context:
diff --git a/core/crypto/aes/aes_gcm.odin b/core/crypto/aes/aes_gcm.odin
index 8616821ce..d349aa353 100644
--- a/core/crypto/aes/aes_gcm.odin
+++ b/core/crypto/aes/aes_gcm.odin
@@ -36,15 +36,11 @@ init_gcm :: proc(ctx: ^Context_GCM, key: []byte, impl := DEFAULT_IMPLEMENTATION)
 //
 // dst and plaintext MUST alias exactly or not at all.
seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)
 
 	gcm_validate_common_slice_sizes(tag, iv, aad, plaintext)
-	if len(dst) != len(plaintext) {
-		panic("crypto/aes: invalid destination ciphertext size")
-	}
-	if bytes.alias_inexactly(dst, plaintext) {
-		panic("crypto/aes: dst and plaintext alias inexactly")
-	}
+	ensure(len(dst) == len(plaintext), "crypto/aes: invalid destination ciphertext size")
+	ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/aes: dst and plaintext alias inexactly")
 
 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		gcm_seal_hw(&impl, dst, tag, iv, aad, plaintext)
@@ -76,15 +72,11 @@ seal_gcm :: proc(ctx: ^Context_GCM, dst, tag, iv, aad, plaintext: []byte) {
 // dst and plaintext MUST alias exactly or not at all.
 @(require_results)
 open_gcm :: proc(ctx: ^Context_GCM, dst, iv, aad, ciphertext, tag: []byte) -> bool {
-	assert(ctx._is_initialized)
+	ensure(ctx._is_initialized)
 
 	gcm_validate_common_slice_sizes(tag, iv, aad, ciphertext)
-	if len(dst) != len(ciphertext) {
-		panic("crypto/aes: invalid destination plaintext size")
-	}
-	if bytes.alias_inexactly(dst, ciphertext) {
-		panic("crypto/aes: dst and ciphertext alias inexactly")
-	}
+	ensure(len(dst) == len(ciphertext), "crypto/aes: invalid destination plaintext size")
+	ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/aes: dst and ciphertext alias inexactly")
 
 	if impl, is_hw := ctx._impl.(Context_Impl_Hardware); is_hw {
 		return gcm_open_hw(&impl, dst, iv, aad, ciphertext, tag)
@@ -122,21 +114,13 @@ reset_gcm :: proc "contextless" (ctx: ^Context_GCM) {
 
 @(private = "file")
 gcm_validate_common_slice_sizes :: proc(tag, iv, aad, text: []byte) {
-	if len(tag) != GCM_TAG_SIZE {
-		panic("crypto/aes: invalid GCM tag size")
-	}
+	ensure(len(tag) == GCM_TAG_SIZE, "crypto/aes: invalid GCM tag size")
 
 	// The specification supports IVs in the range [1, 2^64) bits.
-	if l := len(iv); l == 0 || u64(l) >= GCM_IV_SIZE_MAX {
-		panic("crypto/aes: invalid GCM IV size")
-	}
+	ensure(len(iv) > 0 && u64(len(iv)) < GCM_IV_SIZE_MAX, "crypto/aes: invalid GCM IV size")
 
-	if aad_len := u64(len(aad)); aad_len > GCM_A_MAX {
-		panic("crypto/aes: oversized GCM aad")
-	}
-	if text_len := u64(len(text)); text_len > GCM_P_MAX {
-		panic("crypto/aes: oversized GCM src data")
-	}
+	ensure(u64(len(aad)) <= GCM_A_MAX, "crypto/aes: oversized GCM aad")
+	ensure(u64(len(text)) <= GCM_P_MAX, "crypto/aes: oversized GCM data")
 }
 
 @(private = "file")
diff --git a/core/crypto/aes/aes_gcm_hw_intel.odin b/core/crypto/aes/aes_gcm_hw_intel.odin
index 4cb5ab3b2..3982d1452 100644
--- a/core/crypto/aes/aes_gcm_hw_intel.odin
+++ b/core/crypto/aes/aes_gcm_hw_intel.odin
@@ -235,7 +235,7 @@ gctr_hw :: proc(
 // BUG: Sticking this in gctr_hw (like the other implementations) crashes
 // the compiler.
 //
-// src/check_expr.cpp(7892): Assertion Failure: `c->curr_proc_decl->entity`
+// src/check_expr.cpp(8104): Assertion Failure: `c->curr_proc_decl->entity`
 @(private = "file", enable_target_feature = "sse4.1")
 hw_inc_ctr32 :: #force_inline proc "contextless" (src: ^x86.__m128i, ctr: u32) -> (x86.__m128i, u32) {
 	ret := x86._mm_insert_epi32(src^, i32(intrinsics.byte_swap(ctr)), 3)
diff --git a/core/crypto/blake2b/blake2b.odin b/core/crypto/blake2b/blake2b.odin
index 74396b103..3b3fc6649 100644
--- a/core/crypto/blake2b/blake2b.odin
+++ b/core/crypto/blake2b/blake2b.odin
@@ -18,7 +18,7 @@ package blake2b
 import "../_blake2"
 
 // DIGEST_SIZE is the BLAKE2b digest size in bytes.
-DIGEST_SIZE :: 64
+DIGEST_SIZE :: _blake2.BLAKE2B_SIZE
 
 // BLOCK_SIZE is the BLAKE2b block size in bytes.
 BLOCK_SIZE :: _blake2.BLAKE2B_BLOCK_SIZE
@@ -27,9 +27,11 @@ BLOCK_SIZE :: _blake2.BLAKE2B_BLOCK_SIZE
 Context :: _blake2.Blake2b_Context
 
 // init initializes a Context with the default BLAKE2b config.
-init :: proc(ctx: ^Context) {
+init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
+	ensure(digest_size >= 1 && digest_size <= DIGEST_SIZE, "crypto/blake2b: invalid digest size") // 1..DIGEST_SIZE; negatives would otherwise wrap in u8() below
+
 	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2B_SIZE
+	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }
 
diff --git a/core/crypto/blake2s/blake2s.odin b/core/crypto/blake2s/blake2s.odin
index 339ddf027..9bbd44541 100644
--- a/core/crypto/blake2s/blake2s.odin
+++ b/core/crypto/blake2s/blake2s.odin
@@ -18,7 +18,7 @@ package blake2s
 import "../_blake2"
 
 // DIGEST_SIZE is the BLAKE2s digest size in bytes.
-DIGEST_SIZE :: 32
+DIGEST_SIZE :: _blake2.BLAKE2S_SIZE
 
 // BLOCK_SIZE is the BLAKE2s block size in bytes.
 BLOCK_SIZE :: _blake2.BLAKE2S_BLOCK_SIZE
@@ -27,9 +27,11 @@ BLOCK_SIZE :: _blake2.BLAKE2S_BLOCK_SIZE
 Context :: _blake2.Blake2s_Context
 
 // init initializes a Context with the default BLAKE2s config.
-init :: proc(ctx: ^Context) {
+init :: proc(ctx: ^Context, digest_size := DIGEST_SIZE) {
+	ensure(digest_size >= 1 && digest_size <= DIGEST_SIZE, "crypto/blake2s: invalid digest size") // 1..DIGEST_SIZE; negatives would otherwise wrap in u8() below
+
 	cfg: _blake2.Blake2_Config
-	cfg.size = _blake2.BLAKE2S_SIZE
+	cfg.size = u8(digest_size)
 	_blake2.init(ctx, &cfg)
 }
 
diff --git a/core/crypto/chacha20/chacha20.odin b/core/crypto/chacha20/chacha20.odin
index dfab2bc65..e8d67eb3e 100644
--- a/core/crypto/chacha20/chacha20.odin
+++ b/core/crypto/chacha20/chacha20.odin
@@ -27,12 +27,8 @@ Context :: struct {
 // init inititializes a Context for ChaCha20 or XChaCha20 with the provided
 // key and iv.
init :: proc(ctx: ^Context, key, iv: []byte, impl := DEFAULT_IMPLEMENTATION) { - if len(key) != KEY_SIZE { - panic("crypto/chacha20: invalid (X)ChaCha20 key size") - } - if l := len(iv); l != IV_SIZE && l != XIV_SIZE { - panic("crypto/chacha20: invalid (X)ChaCha20 IV size") - } + ensure(len(key) == KEY_SIZE, "crypto/chacha20: invalid (X)ChaCha20 key size") + ensure(len(iv) == IV_SIZE || len(iv) == XIV_SIZE, "crypto/chacha20: invalid (X)ChaCha20 IV size") k, n := key, iv @@ -67,16 +63,14 @@ seek :: proc(ctx: ^Context, block_nr: u64) { // keystream, and writes the resulting output to dst. Dst and src MUST // alias exactly or not at all. xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { - assert(ctx._state._is_initialized) + ensure(ctx._state._is_initialized) src, dst := src, dst if dst_len := len(dst); dst_len < len(src) { src = src[:dst_len] } - if bytes.alias_inexactly(dst, src) { - panic("crypto/chacha20: dst and src alias inexactly") - } + ensure(!bytes.alias_inexactly(dst, src), "crypto/chacha20: dst and src alias inexactly") st := &ctx._state #no_bounds_check for remaining := len(src); remaining > 0; { @@ -114,7 +108,7 @@ xor_bytes :: proc(ctx: ^Context, dst, src: []byte) { // keystream_bytes fills dst with the raw (X)ChaCha20 keystream output. 
keystream_bytes :: proc(ctx: ^Context, dst: []byte) { - assert(ctx._state._is_initialized) + ensure(ctx._state._is_initialized) dst, st := dst, &ctx._state #no_bounds_check for remaining := len(dst); remaining > 0; { diff --git a/core/crypto/chacha20poly1305/chacha20poly1305.odin b/core/crypto/chacha20poly1305/chacha20poly1305.odin index 3de2532dd..6706b3820 100644 --- a/core/crypto/chacha20poly1305/chacha20poly1305.odin +++ b/core/crypto/chacha20poly1305/chacha20poly1305.odin @@ -29,13 +29,9 @@ _P_MAX :: 64 * 0xffffffff // 64 * (2^32-1) @(private) _validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bool) { - if len(tag) != TAG_SIZE { - panic("crypto/chacha20poly1305: invalid destination tag size") - } expected_iv_len := is_xchacha ? XIV_SIZE : IV_SIZE - if len(iv) != expected_iv_len { - panic("crypto/chacha20poly1305: invalid IV size") - } + ensure(len(tag) == TAG_SIZE, "crypto/chacha20poly1305: invalid destination tag size") + ensure(len(iv) == expected_iv_len, "crypto/chacha20poly1305: invalid IV size") #assert(size_of(int) == 8 || size_of(int) <= 4) when size_of(int) == 8 { @@ -45,13 +41,11 @@ _validate_common_slice_sizes :: proc (tag, iv, aad, text: []byte, is_xchacha: bo // A_MAX is limited by size_of(int), so there is no need to // enforce it. P_MAX only needs to be checked on 64-bit targets, // for reasons that should be obvious. - if text_len := len(text); text_len > _P_MAX { - panic("crypto/chacha20poly1305: oversized src data") - } + ensure(len(text) <= _P_MAX, "crypto/chacha20poly1305: oversized src data") } } -@(private) +@(private, rodata) _PAD: [16]byte @(private) @@ -71,9 +65,7 @@ Context :: struct { // init initializes a Context with the provided key, for AEAD_CHACHA20_POLY1305. 
init :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEMENTATION) { - if len(key) != KEY_SIZE { - panic("crypto/chacha20poly1305: invalid key size") - } + ensure(len(key) == KEY_SIZE, "crypto/chacha20poly1305: invalid key size") copy(ctx._key[:], key) ctx._impl = impl @@ -96,11 +88,11 @@ init_xchacha :: proc(ctx: ^Context, key: []byte, impl := chacha20.DEFAULT_IMPLEM // // dst and plaintext MUST alias exactly or not at all. seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + ensure(ctx._is_initialized) + ciphertext := dst _validate_common_slice_sizes(tag, iv, aad, plaintext, ctx._is_xchacha) - if len(ciphertext) != len(plaintext) { - panic("crypto/chacha20poly1305: invalid destination ciphertext size") - } + ensure(len(ciphertext) == len(plaintext), "crypto/chacha20poly1305: invalid destination ciphertext size") stream_ctx: chacha20.Context = --- chacha20.init(&stream_ctx, ctx._key[:],iv, ctx._impl) @@ -151,11 +143,11 @@ seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { // dst and plaintext MUST alias exactly or not at all. @(require_results) open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + ensure(ctx._is_initialized) + plaintext := dst _validate_common_slice_sizes(tag, iv, aad, ciphertext, ctx._is_xchacha) - if len(ciphertext) != len(plaintext) { - panic("crypto/chacha20poly1305: invalid destination plaintext size") - } + ensure(len(ciphertext) == len(plaintext), "crypto/chacha20poly1305: invalid destination plaintext size") // Note: Unlike encrypt, this can fail early, so use defer for // sanitization rather than assuming control flow reaches certain diff --git a/core/crypto/deoxysii/deoxysii.odin b/core/crypto/deoxysii/deoxysii.odin new file mode 100644 index 000000000..cead770e2 --- /dev/null +++ b/core/crypto/deoxysii/deoxysii.odin @@ -0,0 +1,280 @@ +/* +package deoxysii implements the Deoxys-II-256 Authenticated Encryption +with Additional Data algorithm. 
+
+- [[ https://sites.google.com/view/deoxyscipher ]]
+- [[ https://thomaspeyrin.github.io/web/assets/docs/papers/Jean-etal-JoC2021.pdf ]]
+*/
+package deoxysii
+
+import "base:intrinsics"
+import "core:bytes"
+import "core:crypto/aes"
+import "core:mem"
+import "core:simd"
+
+// KEY_SIZE is the Deoxys-II-256 key size in bytes.
+KEY_SIZE :: 32
+// IV_SIZE is the Deoxys-II-256 IV size in bytes.
+IV_SIZE :: 15 // 120-bits
+// TAG_SIZE is the Deoxys-II-256 tag size in bytes.
+TAG_SIZE :: 16
+
+@(private)
+PREFIX_AD_BLOCK :: 0b0010
+@(private)
+PREFIX_AD_FINAL :: 0b0110
+@(private)
+PREFIX_MSG_BLOCK :: 0b0000
+@(private)
+PREFIX_MSG_FINAL :: 0b0100
+@(private)
+PREFIX_TAG :: 0b0001
+@(private)
+PREFIX_SHIFT :: 4
+
+@(private)
+BC_ROUNDS :: 16
+@(private)
+BLOCK_SIZE :: aes.BLOCK_SIZE
+
+@(private = "file")
+_LFSR2_MASK :: simd.u8x16{
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+}
+@(private = "file")
+_LFSR3_MASK :: simd.u8x16{
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+}
+@(private = "file")
+_LFSR_SH1 :: _LFSR2_MASK
+@(private = "file")
+_LFSR_SH5 :: simd.u8x16{
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+}
+@(private = "file")
+_LFSR_SH7 :: simd.u8x16{
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+}
+@(private = "file", rodata)
+_RCONS := []byte { // 17 entries, i.e. BC_ROUNDS+1
+	0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a,
+	0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
+	0x72,
+}
+
+// Context is a keyed Deoxys-II-256 instance.
+Context :: struct { + _subkeys: [BC_ROUNDS+1][16]byte, + _impl: aes.Implementation, + _is_initialized: bool, +} + +@(private) +_validate_common_slice_sizes :: proc (ctx: ^Context, tag, iv, aad, text: []byte) { + ensure(len(tag) == TAG_SIZE, "crypto/deoxysii: invalid tag size") + ensure(len(iv) == IV_SIZE, "crypto/deoxysii: invalid IV size") + + #assert(size_of(int) == 8 || size_of(int) <= 4) + // For the nonce-misuse resistant mode, the total size of the + // associated data and the total size of the message do not exceed + // `16 * 2^max_l * 2^max_m bytes`, thus 2^128 bytes for all variants + // of Deoxys-II. Moreover, the maximum number of messages that can + // be handled for a same key is 2^max_m, that is 2^64 for all variants + // of Deoxys. +} + +// init initializes a Context with the provided key. +init :: proc(ctx: ^Context, key: []byte, impl := aes.DEFAULT_IMPLEMENTATION) { + ensure(len(key) == KEY_SIZE, "crypto/deoxysii: invalid key size") + + ctx._impl = impl + if ctx._impl == .Hardware && !is_hardware_accelerated() { + ctx._impl = .Portable + } + + derive_ks(ctx, key) + + ctx._is_initialized = true +} + +// seal encrypts the plaintext and authenticates the aad and ciphertext, +// with the provided Context and iv, stores the output in dst and tag. +// +// dst and plaintext MUST alias exactly or not at all. 
+seal :: proc(ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) { + ensure(ctx._is_initialized) + + _validate_common_slice_sizes(ctx, tag, iv, aad, plaintext) + ensure(len(dst) == len(plaintext), "crypto/deoxysii: invalid destination ciphertext size") + ensure(!bytes.alias_inexactly(dst, plaintext), "crypto/deoxysii: dst and plaintext alias inexactly") + + switch ctx._impl { + case .Hardware: + e_hw(ctx, dst, tag, iv, aad, plaintext) + case .Portable: + e_ref(ctx, dst, tag, iv, aad, plaintext) + } +} + +// open authenticates the aad and ciphertext, and decrypts the ciphertext, +// with the provided Context, iv, and tag, and stores the output in dst, +// returning true iff the authentication was successful. If authentication +// fails, the destination buffer will be zeroed. +// +// dst and ciphertext MUST alias exactly or not at all. +@(require_results) +open :: proc(ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + ensure(ctx._is_initialized) + + _validate_common_slice_sizes(ctx, tag, iv, aad, ciphertext) + ensure(len(dst) == len(ciphertext), "crypto/deoxysii: invalid destination plaintext size") + ensure(!bytes.alias_inexactly(dst, ciphertext), "crypto/deoxysii: dst and ciphertext alias inexactly") + + ok: bool + switch ctx._impl { + case .Hardware: + ok = d_hw(ctx, dst, iv, aad, ciphertext, tag) + case .Portable: + ok = d_ref(ctx, dst, iv, aad, ciphertext, tag) + } + if !ok { + mem.zero_explicit(raw_data(dst), len(ciphertext)) + } + + return ok +} + +// reset sanitizes the Context. The Context must be +// re-initialized to be used again. +reset :: proc "contextless" (ctx: ^Context) { + mem.zero_explicit(&ctx._subkeys, size_of(ctx._subkeys)) + ctx._is_initialized = false +} + +@(private = "file") +derive_ks :: proc "contextless" (ctx: ^Context, key: []byte) { + // Derive the constant component of each subtweakkey.
+ // + // The key schedule is as thus: + // + // STK_i = TK1_i ^ TK2_i ^ TK3_i ^ RC_i + // + // TK1_i = h(TK1_(i-1)) + // TK2_i = h(LFSR2(TK2_(i-1))) + // TK3_i = h(LFSR3(TK3_(i-1))) + // + // where: + // + // KT = K || T + // W3 = KT[:16] + // W2 = KT[16:32] + // W1 = KT[32:] + // + // TK1_0 = W1 + // TK2_0 = W2 + // TK3_0 = W3 + // + // As `K` is fixed per Context, the XORs of `TK3_0 .. TK3_n`, + // `TK2_0 .. TK2_n` and RC_i can be precomputed in advance like + // thus: + // + // subkey_i = TK3_i ^ TK2_i ^ RC_i + // + // When it is time to actually call Deoxys-BC-384, it is then + // a simple matter of deriving each round subtweakkey via: + // + // TK1_0 = T (Tweak) + // STK_0 = subkey_0 ^ TK1_0 + // STK_i = subkey_i (precomputed) ^ H(TK1_(i-1)) + // + // We opt to use SIMD here and for the subtweakkey derivation + // as `H()` is typically a single vector instruction. + + tk2 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key[16:]))) + tk3 := intrinsics.unaligned_load((^simd.u8x16)(raw_data(key))) + + // subkey_0 does not apply LFSR2/3 or H. + intrinsics.unaligned_store( + (^simd.u8x16)(&ctx._subkeys[0]), + simd.bit_xor( + tk2, + simd.bit_xor( + tk3, + rcon(0), + ), + ), + ) + + // Precompute k_1 .. k_16. + for i in 1 ..< BC_ROUNDS+1 { + tk2 = h(lfsr2(tk2)) + tk3 = h(lfsr3(tk3)) + intrinsics.unaligned_store( + (^simd.u8x16)(&ctx._subkeys[i]), + simd.bit_xor( + tk2, + simd.bit_xor( + tk3, + rcon(i), + ), + ), + ) + } +} + +@(private = "file") +lfsr2 :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 { + // LFSR2 is an application of the following LFSR to each byte of input.
+ // (x7||x6||x5||x4||x3||x2||x1||x0) -> (x6||x5||x4||x3||x2||x1||x0||x7 ^ x5) + return simd.bit_or( + simd.shl(tk, _LFSR_SH1), + simd.bit_and( + simd.bit_xor( + simd.shr(tk, _LFSR_SH7), // x7 + simd.shr(tk, _LFSR_SH5), // x5 + ), + _LFSR2_MASK, + ), + ) +} + +@(private = "file") +lfsr3 :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 { + // LFSR3 is an application of the following LFSR to each byte of input. + // (x7||x6||x5||x4||x3||x2||x1||x0) -> (x0 ^ x6||x7||x6||x5||x4||x3||x2||x1) + return simd.bit_or( + simd.shr(tk, _LFSR_SH1), + simd.bit_and( + simd.bit_xor( + simd.shl(tk, _LFSR_SH7), // x0 + simd.shl(tk, _LFSR_SH1), // x6 + ), + _LFSR3_MASK, + ), + ) +} + +@(private) +h :: #force_inline proc "contextless" (tk: simd.u8x16) -> simd.u8x16 { + return simd.swizzle( + tk, + 0x01, 0x06, 0x0b, 0x0c, 0x05, 0x0a, 0x0f, 0x00, + 0x09, 0x0e, 0x03, 0x04, 0x0d, 0x02, 0x07, 0x08, + ) +} + +@(private = "file") +rcon :: #force_inline proc "contextless" (rd: int) -> simd.u8x16 #no_bounds_check { + rc := _RCONS[rd] + return simd.u8x16{ + 1, 2, 4, 8, + rc, rc, rc, rc, + 0, 0, 0, 0, + 0, 0, 0, 0, + } +} \ No newline at end of file diff --git a/core/crypto/deoxysii/deoxysii_impl_ct64.odin b/core/crypto/deoxysii/deoxysii_impl_ct64.odin new file mode 100644 index 000000000..c4d0edb03 --- /dev/null +++ b/core/crypto/deoxysii/deoxysii_impl_ct64.odin @@ -0,0 +1,399 @@ +package deoxysii + +import "base:intrinsics" +import "core:crypto" +import aes "core:crypto/_aes/ct64" +import "core:encoding/endian" +import "core:mem" +import "core:simd" + +// This uses the bitsliced 64-bit general purpose register SWAR AES +// round function. The encryption pass skips orthogonalizing the +// AES round function input as it is always going to be the leading 0 +// padded IV, and doing a 64-byte copy is faster.
+ +@(private = "file") +TWEAK_SIZE :: 16 + +@(private = "file") +State_SW :: struct { + ctx: ^Context, + q_stk, q_b: [8]u64, +} + +@(private = "file") +auth_tweak :: #force_inline proc "contextless" ( + dst: ^[TWEAK_SIZE]byte, + prefix: byte, + block_nr: int, +) { + endian.unchecked_put_u64be(dst[8:], u64(block_nr)) + endian.unchecked_put_u64le(dst[0:], u64(prefix) << PREFIX_SHIFT) // dst[0] = prefix << PREFIX_SHIFT +} + +@(private = "file") +enc_tweak :: #force_inline proc "contextless" ( + dst: ^[TWEAK_SIZE]byte, + tag: ^[TAG_SIZE]byte, + block_nr: int, +) { + tmp: [8]byte + endian.unchecked_put_u64be(tmp[:], u64(block_nr)) + + copy(dst[:], tag[:]) + dst[0] |= 0x80 + for i in 0 ..< 8 { + dst[i+8] ~= tmp[i] + } +} + +@(private = "file") +enc_plaintext :: #force_inline proc "contextless" ( + dst: ^[8]u64, + iv: []byte, +) { + tmp: [BLOCK_SIZE]byte = --- + tmp[0] = 0 + copy(tmp[1:], iv[:]) + + q_0, q_1 := aes.load_interleaved(tmp[:]) + for i in 0 ..< 4 { + dst[i], dst[i+4] = q_0, q_1 + } + aes.orthogonalize(dst) +} + +@(private = "file") +bc_x4 :: proc "contextless" ( + ctx: ^Context, + dst: []byte, + tweaks: ^[4][TWEAK_SIZE]byte, + q_stk: ^[8]u64, + q_b: ^[8]u64, // Orthogonalized + n: int, +) { + tk1s: [4]simd.u8x16 + for j in 0 ..< n { + tk1s[j] = intrinsics.unaligned_load((^simd.u8x16)(&tweaks[j])) + } + + // Deoxys-BC-384 + for i in 0 ..= BC_ROUNDS { + // Derive the round's subtweakkey + sk := intrinsics.unaligned_load((^simd.u8x16)(&ctx._subkeys[i])) + for j in 0 ..< n { + if i != 0 { + tk1s[j] = h(tk1s[j]) + } + intrinsics.unaligned_store( + (^simd.u8x16)(raw_data(dst)), + simd.bit_xor(sk, tk1s[j]), + ) + q_stk[j], q_stk[j+4] = aes.load_interleaved(dst[:]) + } + aes.orthogonalize(q_stk) + + if i != 0 { + aes.sub_bytes(q_b) + aes.shift_rows(q_b) + aes.mix_columns(q_b) + } + aes.add_round_key(q_b, q_stk[:]) + } + + aes.orthogonalize(q_b) + for i in 0 ..< n { + aes.store_interleaved(dst[i*BLOCK_SIZE:], q_b[i], q_b[i+4]) + } +} + +@(private = "file", 
require_results) +bc_absorb :: proc "contextless" ( + st: ^State_SW, + dst: []byte, + src: []byte, + tweak_prefix: byte, + stk_block_nr: int, +) -> int { + tweaks: [4][TWEAK_SIZE]byte = --- + tmp: [BLOCK_SIZE*4]byte = --- + + src, stk_block_nr := src, stk_block_nr + dst_ := intrinsics.unaligned_load((^simd.u8x16)(raw_data(dst))) + + nr_blocks := len(src) / BLOCK_SIZE + for nr_blocks > 0 { + // Derive the tweak(s), orthogonalize the plaintext + n := min(nr_blocks, 4) + for i in 0 ..< n { + auth_tweak(&tweaks[i], tweak_prefix, stk_block_nr + i) + st.q_b[i], st.q_b[i + 4] = aes.load_interleaved(src) + src = src[BLOCK_SIZE:] + } + aes.orthogonalize(&st.q_b) + + // Deoxys-BC-384 + bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n) + + // XOR in the existing Auth/tag + for i in 0 ..< n { + dst_ = simd.bit_xor( + dst_, + intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))), + ) + } + + stk_block_nr += n + nr_blocks -= n + } + + intrinsics.unaligned_store((^simd.u8x16)(raw_data(dst)), dst_) + + mem.zero_explicit(&tweaks, size_of(tweaks)) + mem.zero_explicit(&tmp, size_of(tmp)) + + return stk_block_nr +} + +@(private = "file") +bc_final :: proc "contextless" ( + st: ^State_SW, + dst: []byte, + iv: []byte, +) { + tweaks: [4][TWEAK_SIZE]byte = --- + + tweaks[0][0] = PREFIX_TAG << PREFIX_SHIFT + copy(tweaks[0][1:], iv) + + st.q_b[0], st.q_b[4] = aes.load_interleaved(dst) + aes.orthogonalize(&st.q_b) + + bc_x4(st.ctx, dst, &tweaks, &st.q_stk, &st.q_b, 1) +} + +@(private = "file", require_results) +bc_encrypt :: proc "contextless" ( + st: ^State_SW, + dst: []byte, + src: []byte, + q_n: ^[8]u64, // Orthogonalized + tweak_tag: ^[TAG_SIZE]byte, + stk_block_nr: int, +) -> int { + tweaks: [4][TWEAK_SIZE]byte = --- + tmp: [BLOCK_SIZE*4]byte = --- + + dst, src, stk_block_nr := dst, src, stk_block_nr + + nr_blocks := len(src) / BLOCK_SIZE + for nr_blocks > 0 { + // Derive the tweak(s) + n := min(nr_blocks, 4) + for i in 0 ..< n { + enc_tweak(&tweaks[i], tweak_tag, 
stk_block_nr + i) + } + st.q_b = q_n^ // The plaintext is always `0^8 || N` + + // Deoxys-BC-384 + bc_x4(st.ctx, tmp[:], &tweaks, &st.q_stk, &st.q_b, n) + + // XOR the ciphertext + for i in 0 ..< n { + intrinsics.unaligned_store( + (^simd.u8x16)(raw_data(dst[i*BLOCK_SIZE:])), + simd.bit_xor( + intrinsics.unaligned_load((^simd.u8x16)(raw_data(src[i*BLOCK_SIZE:]))), + intrinsics.unaligned_load((^simd.u8x16)(raw_data(tmp[i*BLOCK_SIZE:]))), + ), + ) + } + + dst, src = dst[n*BLOCK_SIZE:], src[n*BLOCK_SIZE:] + stk_block_nr += n + nr_blocks -= n + } + + mem.zero_explicit(&tweaks, size_of(tweaks)) + mem.zero_explicit(&tmp, size_of(tmp)) + + return stk_block_nr +} + +@(private) +e_ref :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check { + st: State_SW = --- + st.ctx = ctx + + // Algorithm 3 + // + // Associated data + // A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n + // Auth <- 0^n + // for i = 0 to la − 1 do + // Auth <- Auth ^ EK(0010 || i, A_i+1) + // end + // if A_∗ != nil then + // Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗)) + // end + auth: [TAG_SIZE]byte + aad := aad + n := bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0) + aad = aad[n*BLOCK_SIZE:] + if l := len(aad); l > 0 { + a_star: [BLOCK_SIZE]byte + + copy(a_star[:], aad) + a_star[l] = 0x80 + + _ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n) + } + + // Message authentication and tag generation + // M_1 || ... 
|| M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n + // tag <- Auth + // for j = 0 to l − 1 do + // tag <- tag ^ EK(0000 || j, M_j+1) + // end + // if M_∗ != nil then + // tag <- tag ^ EK(0100 || l, pad10∗(M_∗)) + // end + // tag <- EK(0001 || 0^4 || N, tag) + m := plaintext + n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + m_star[l] = 0x80 + + _ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n) + } + bc_final(&st, auth[:], iv) + + // Message encryption + // for j = 0 to l − 1 do + // C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N) + // end + // if M_∗ != nil then + // C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N) + // end + // + // return (C_1 || ... || C_l || C_∗, tag) + q_iv: [8]u64 = --- + enc_plaintext(&q_iv, iv) + + m = plaintext + n = bc_encrypt(&st, dst, m, &q_iv, &auth, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + _ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n) + + copy(dst[n*BLOCK_SIZE:], m_star[:]) + + mem.zero_explicit(&m_star, size_of(m_star)) + } + + copy(tag, auth[:]) + + mem.zero_explicit(&st.q_stk, size_of(st.q_stk)) + mem.zero_explicit(&st.q_b, size_of(st.q_b)) +} + +@(private, require_results) +d_ref :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + st: State_SW = --- + st.ctx = ctx + + // Algorithm 4 + // + // Message decryption + // C_1 || ... 
|| C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n + // for j = 0 to l − 1 do + // M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N) + // end + // if C_∗ != nil then + // M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N) + // end + q_iv: [8]u64 = --- + enc_plaintext(&q_iv, iv) + + auth: [TAG_SIZE]byte + copy(auth[:], tag) + + m := ciphertext + n := bc_encrypt(&st, dst, m, &q_iv, &auth, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + _ = bc_encrypt(&st, m_star[:], m_star[:], &q_iv, &auth, n) + + copy(dst[n*BLOCK_SIZE:], m_star[:]) + + mem.zero_explicit(&m_star, size_of(m_star)) + } + + // Associated data + // A_1 || ... || Al_a || A_∗ <- A where each |Ai_| = n and |A_∗| < n + // Auth <- 0 + // for i = 0 to la − 1 do + // Auth <- Auth ^ EK(0010 || i, A_i+1) + // end + // if A∗ != nil then + // Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗)) + // end + auth = 0 + aad := aad + n = bc_absorb(&st, auth[:], aad, PREFIX_AD_BLOCK, 0) + aad = aad[n*BLOCK_SIZE:] + if l := len(aad); l > 0 { + a_star: [BLOCK_SIZE]byte + + copy(a_star[:], aad) + a_star[l] = 0x80 + + _ = bc_absorb(&st, auth[:], a_star[:], PREFIX_AD_FINAL, n) + } + + // Message authentication and tag generation + // M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n + // tag0 <- Auth + // for j = 0 to l − 1 do + // tag0 <- tag0 ^ EK(0000 || j, M_j+1) + // end + // if M_∗ != nil then + // tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗)) + // end + // tag0 <- EK(0001 || 0^4 || N, tag0) + m = dst[:len(ciphertext)] + n = bc_absorb(&st, auth[:], m, PREFIX_MSG_BLOCK, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + m_star[l] = 0x80 + + _ = bc_absorb(&st, auth[:], m_star[:], PREFIX_MSG_FINAL, n) + + mem.zero_explicit(&m_star, size_of(m_star)) + } + bc_final(&st, auth[:], iv) + + // Tag verification + // if tag0 = tag then return (M_1 || ... 
|| M_l || M_∗) + // else return false + ok := crypto.compare_constant_time(auth[:], tag) == 1 + + mem.zero_explicit(&auth, size_of(auth)) + mem.zero_explicit(&st.q_stk, size_of(st.q_stk)) + mem.zero_explicit(&st.q_b, size_of(st.q_b)) + + return ok +} diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin new file mode 100644 index 000000000..b0705ca62 --- /dev/null +++ b/core/crypto/deoxysii/deoxysii_impl_hw_gen.odin @@ -0,0 +1,21 @@ +#+build !amd64 +package deoxysii + +@(private = "file") +ERR_HW_NOT_SUPPORTED :: "crypto/deoxysii: hardware implementation unsupported" + +// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is supported. +is_hardware_accelerated :: proc "contextless" () -> bool { + return false +} + +@(private) +e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} + +@(private, require_results) +d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} diff --git a/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin b/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin new file mode 100644 index 000000000..d268009a2 --- /dev/null +++ b/core/crypto/deoxysii/deoxysii_impl_hw_intel.odin @@ -0,0 +1,434 @@ +#+build amd64 +package deoxysii + +import "base:intrinsics" +import "core:crypto" +import "core:crypto/aes" +import "core:mem" +import "core:simd" +import "core:simd/x86" + +// This processes a maximum of 4 blocks at a time, as that is suitable +// for most current hardware that doesn't say "Xeon". 
+ +@(private = "file") +_BIT_ENC :: x86.__m128i{0x80, 0} +@(private = "file") +_PREFIX_AD_BLOCK :: x86.__m128i{PREFIX_AD_BLOCK << PREFIX_SHIFT, 0} +@(private = "file") +_PREFIX_AD_FINAL :: x86.__m128i{PREFIX_AD_FINAL << PREFIX_SHIFT, 0} +@(private = "file") +_PREFIX_MSG_BLOCK :: x86.__m128i{PREFIX_MSG_BLOCK << PREFIX_SHIFT, 0} +@(private = "file") +_PREFIX_MSG_FINAL :: x86.__m128i{PREFIX_MSG_FINAL << PREFIX_SHIFT, 0} + +// is_hardware_accelerated returns true iff hardware accelerated Deoxys-II +// is supported. +is_hardware_accelerated :: proc "contextless" () -> bool { + return aes.is_hardware_accelerated() +} + +@(private = "file", enable_target_feature = "sse4.1", require_results) +auth_tweak :: #force_inline proc "contextless" ( + prefix: x86.__m128i, + block_nr: int, +) -> x86.__m128i { + return x86._mm_insert_epi64(prefix, i64(intrinsics.byte_swap(u64(block_nr))), 1) +} + +@(private = "file", enable_target_feature = "sse2", require_results) +enc_tweak :: #force_inline proc "contextless" ( + tag: x86.__m128i, + block_nr: int, +) -> x86.__m128i { + return x86._mm_xor_si128( + x86._mm_or_si128(tag, _BIT_ENC), + x86.__m128i{0, i64(intrinsics.byte_swap(u64(block_nr)))}, + ) +} + +@(private = "file", enable_target_feature = "ssse3", require_results) +h_ :: #force_inline proc "contextless" (tk1: x86.__m128i) -> x86.__m128i { + return transmute(x86.__m128i)h(transmute(simd.u8x16)tk1) +} + +@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +bc_x4 :: #force_inline proc "contextless" ( + ctx: ^Context, + s_0, s_1, s_2, s_3: x86.__m128i, + tweak_0, tweak_1, tweak_2, tweak_3: x86.__m128i, +) -> (x86.__m128i, x86.__m128i, x86.__m128i, x86.__m128i) #no_bounds_check { + s_0, s_1, s_2, s_3 := s_0, s_1, s_2, s_3 + tk1_0, tk1_1, tk1_2, tk1_3 := tweak_0, tweak_1, tweak_2, tweak_3 + + sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) + stk_0 := x86._mm_xor_si128(tk1_0, sk) + stk_1 := x86._mm_xor_si128(tk1_1, sk) + stk_2 := 
x86._mm_xor_si128(tk1_2, sk) + stk_3 := x86._mm_xor_si128(tk1_3, sk) + + s_0 = x86._mm_xor_si128(s_0, stk_0) + s_1 = x86._mm_xor_si128(s_1, stk_1) + s_2 = x86._mm_xor_si128(s_2, stk_2) + s_3 = x86._mm_xor_si128(s_3, stk_3) + + for i in 1 ..= BC_ROUNDS { + sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + + tk1_0 = h_(tk1_0) + tk1_1 = h_(tk1_1) + tk1_2 = h_(tk1_2) + tk1_3 = h_(tk1_3) + + stk_0 = x86._mm_xor_si128(tk1_0, sk) + stk_1 = x86._mm_xor_si128(tk1_1, sk) + stk_2 = x86._mm_xor_si128(tk1_2, sk) + stk_3 = x86._mm_xor_si128(tk1_3, sk) + + s_0 = x86._mm_aesenc_si128(s_0, stk_0) + s_1 = x86._mm_aesenc_si128(s_1, stk_1) + s_2 = x86._mm_aesenc_si128(s_2, stk_2) + s_3 = x86._mm_aesenc_si128(s_3, stk_3) + } + + return s_0, s_1, s_2, s_3 +} + +@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +bc_x1 :: #force_inline proc "contextless" ( + ctx: ^Context, + s: x86.__m128i, + tweak: x86.__m128i, +) -> x86.__m128i #no_bounds_check { + s, tk1 := s, tweak + + sk := intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[0])) + stk := x86._mm_xor_si128(tk1, sk) + + s = x86._mm_xor_si128(s, stk) + + for i in 1 ..= BC_ROUNDS { + sk = intrinsics.unaligned_load((^x86.__m128i)(&ctx._subkeys[i])) + + tk1 = h_(tk1) + + stk = x86._mm_xor_si128(tk1, sk) + + s = x86._mm_aesenc_si128(s, stk) + } + + return s +} + +@(private = "file", enable_target_feature = "sse2,ssse3,sse4.1,aes", require_results) +bc_absorb :: proc "contextless" ( + ctx: ^Context, + tag: x86.__m128i, + src: []byte, + tweak_prefix: x86.__m128i, + stk_block_nr: int, +) -> (x86.__m128i, int) #no_bounds_check { + src, stk_block_nr, tag := src, stk_block_nr, tag + + nr_blocks := len(src) / BLOCK_SIZE + for nr_blocks >= 4 { + d_0, d_1, d_2, d_3 := bc_x4( + ctx, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), + 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + auth_tweak(tweak_prefix, stk_block_nr), + auth_tweak(tweak_prefix, stk_block_nr + 1), + auth_tweak(tweak_prefix, stk_block_nr + 2), + auth_tweak(tweak_prefix, stk_block_nr + 3), + ) + + tag = x86._mm_xor_si128(tag, d_0) + tag = x86._mm_xor_si128(tag, d_1) + tag = x86._mm_xor_si128(tag, d_2) + tag = x86._mm_xor_si128(tag, d_3) + + src = src[4*BLOCK_SIZE:] + stk_block_nr += 4 + nr_blocks -= 4 + } + + for nr_blocks > 0 { + d := bc_x1( + ctx, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + auth_tweak(tweak_prefix, stk_block_nr), + ) + + tag = x86._mm_xor_si128(tag, d) + + src = src[BLOCK_SIZE:] + stk_block_nr += 1 + nr_blocks -= 1 + } + + return tag, stk_block_nr +} + +@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +bc_final :: proc "contextless" ( + ctx: ^Context, + tag: x86.__m128i, + iv: []byte, +) -> x86.__m128i { + tmp: [BLOCK_SIZE]byte + + tmp[0] = PREFIX_TAG << PREFIX_SHIFT + copy(tmp[1:], iv) + + tweak := intrinsics.unaligned_load((^x86.__m128i)(&tmp)) + + return bc_x1(ctx, tag, tweak) +} + +@(private = "file", enable_target_feature = "sse2,ssse3,aes", require_results) +bc_encrypt :: proc "contextless" ( + ctx: ^Context, + dst: []byte, + src: []byte, + iv: x86.__m128i, + tweak_tag: x86.__m128i, + stk_block_nr: int, +) -> int { + dst, src, stk_block_nr := dst, src, stk_block_nr + + nr_blocks := len(src) / BLOCK_SIZE + for nr_blocks >= 4 { + d_0, d_1, d_2, d_3 := bc_x4( + ctx, + iv, iv, iv, iv, + enc_tweak(tweak_tag, stk_block_nr), + enc_tweak(tweak_tag, stk_block_nr + 1), + enc_tweak(tweak_tag, stk_block_nr + 2), + enc_tweak(tweak_tag, stk_block_nr + 3), + ) + + intrinsics.unaligned_store( + (^x86.__m128i)(raw_data(dst)), + x86._mm_xor_si128( + d_0, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + ), + ) + intrinsics.unaligned_store( + (^x86.__m128i)(raw_data(dst[BLOCK_SIZE:])), + x86._mm_xor_si128( + d_1, + 
intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[BLOCK_SIZE:]))), + ), + ) + intrinsics.unaligned_store( + (^x86.__m128i)(raw_data(dst[2*BLOCK_SIZE:])), + x86._mm_xor_si128( + d_2, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[2*BLOCK_SIZE:]))), + ), + ) + intrinsics.unaligned_store( + (^x86.__m128i)(raw_data(dst[3*BLOCK_SIZE:])), + x86._mm_xor_si128( + d_3, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src[3*BLOCK_SIZE:]))), + ), + ) + + src, dst = src[4*BLOCK_SIZE:], dst[4*BLOCK_SIZE:] + stk_block_nr += 4 + nr_blocks -= 4 + } + + for nr_blocks > 0 { + d := bc_x1( + ctx, + iv, + enc_tweak(tweak_tag, stk_block_nr), + ) + + intrinsics.unaligned_store( + (^x86.__m128i)(raw_data(dst)), + x86._mm_xor_si128( + d, + intrinsics.unaligned_load((^x86.__m128i)(raw_data(src))), + ), + ) + + src, dst = src[BLOCK_SIZE:], dst[BLOCK_SIZE:] + stk_block_nr += 1 + nr_blocks -= 1 + } + + return stk_block_nr +} + +@(private) +e_hw :: proc "contextless" (ctx: ^Context, dst, tag, iv, aad, plaintext: []byte) #no_bounds_check { + tmp: [BLOCK_SIZE]byte + copy(tmp[1:], iv) + iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + + // Algorithm 3 + // + // Associated data + // A_1 || ... || A_la || A_∗ <- A where each |A_i| = n and |A_∗| < n + // Auth <- 0^n + // for i = 0 to la − 1 do + // Auth <- Auth ^ EK(0010 || i, A_i+1) + // end + // if A_∗ != nil then + // Auth <- Auth ^ EK(0110 || la, pad10∗(A_∗)) + // end + auth: x86.__m128i + n: int + + aad := aad + auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0) + aad = aad[n*BLOCK_SIZE:] + if l := len(aad); l > 0 { + a_star: [BLOCK_SIZE]byte + + copy(a_star[:], aad) + a_star[l] = 0x80 + + auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n) + } + + // Message authentication and tag generation + // M_1 || ... 
|| M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n + // tag <- Auth + // for j = 0 to l − 1 do + // tag <- tag ^ EK(0000 || j, M_j+1) + // end + // if M_∗ != nil then + // tag <- tag ^ EK(0100 || l, pad10∗(M_∗)) + // end + // tag <- EK(0001 || 0^4 ||N, tag) + m := plaintext + auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + m_star[l] = 0x80 + + auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n) + } + auth = bc_final(ctx, auth, iv) + + // Message encryption + // for j = 0 to l − 1 do + // C_j <- M_j ^ EK(1 || tag ^ j, 0^8 || N) + // end + // if M_∗ != nil then + // C_∗ <- M_* ^ EK(1 || tag ^ l, 0^8 || N) + // end + // + // return (C_1 || ... || C_l || C_∗, tag) + m = plaintext + n = bc_encrypt(ctx, dst, m, iv_, auth, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + _ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n) + + copy(dst[n*BLOCK_SIZE:], m_star[:]) + } + + intrinsics.unaligned_store((^x86.__m128i)(raw_data(tag)), auth) +} + +@(private, require_results) +d_hw :: proc "contextless" (ctx: ^Context, dst, iv, aad, ciphertext, tag: []byte) -> bool { + tmp: [BLOCK_SIZE]byte + copy(tmp[1:], iv) + iv_ := intrinsics.unaligned_load((^x86.__m128i)(raw_data(&tmp))) + + // Algorithm 4 + // + // Message decryption + // C_1 || ... 
|| C_l || C_∗ <- C where each |C_j| = n and |C_∗| < n + // for j = 0 to l − 1 do + // M_j <- C_j ^ EK(1 || tag ^ j, 0^8 || N) + // end + // if C_∗ != nil then + // M_∗ <- C_∗ ^ EK(1 || tag ^ l, 0^8 || N) + // end + auth := intrinsics.unaligned_load((^x86.__m128i)(raw_data(tag))) + + m := ciphertext + n := bc_encrypt(ctx, dst, m, iv_, auth, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + _ = bc_encrypt(ctx, m_star[:], m_star[:], iv_, auth, n) + + copy(dst[n*BLOCK_SIZE:], m_star[:]) + + mem.zero_explicit(&m_star, size_of(m_star)) + } + + // Associated data + // A_1 || ... || Al_a || A_∗ <- A where each |Ai_| = n and |A_∗| < n + // Auth <- 0 + // for i = 0 to la − 1 do + // Auth <- Auth ^ EK(0010 || i, A_i+1) + // end + // if A∗ != nil then + // Auth <- Auth ^ EK(0110| | l_a, pad10∗(A_∗)) + // end + auth = x86.__m128i{0, 0} + aad := aad + auth, n = bc_absorb(ctx, auth, aad, _PREFIX_AD_BLOCK, 0) + aad = aad[BLOCK_SIZE*n:] + if l := len(aad); l > 0 { + a_star: [BLOCK_SIZE]byte + + copy(a_star[:], aad) + a_star[l] = 0x80 + + auth, _ = bc_absorb(ctx, auth, a_star[:], _PREFIX_AD_FINAL, n) + } + + // Message authentication and tag generation + // M_1 || ... || M_l || M_∗ <- M where each |M_j| = n and |M_∗| < n + // tag0 <- Auth + // for j = 0 to l − 1 do + // tag0 <- tag0 ^ EK(0000 || j, M_j+1) + // end + // if M_∗ != nil then + // tag0 <- tag0 ^ EK(0100 || l, pad10∗(M_∗)) + // end + // tag0 <- EK(0001 || 0^4 || N, tag0) + m = dst[:len(ciphertext)] + auth, n = bc_absorb(ctx, auth, m, _PREFIX_MSG_BLOCK, 0) + m = m[n*BLOCK_SIZE:] + if l := len(m); l > 0 { + m_star: [BLOCK_SIZE]byte + + copy(m_star[:], m) + m_star[l] = 0x80 + + auth, _ = bc_absorb(ctx, auth, m_star[:], _PREFIX_MSG_FINAL, n) + } + auth = bc_final(ctx, auth, iv) + + // Tag verification + // if tag0 = tag then return (M_1 || ... 
|| M_l || M_∗) + // else return false + intrinsics.unaligned_store((^x86.__m128i)(raw_data(&tmp)), auth) + ok := crypto.compare_constant_time(tmp[:], tag) == 1 + + mem.zero_explicit(&tmp, size_of(tmp)) + + return ok +} diff --git a/core/crypto/ed25519/ed25519.odin b/core/crypto/ed25519/ed25519.odin index 460a19563..deeb80685 100644 --- a/core/crypto/ed25519/ed25519.odin +++ b/core/crypto/ed25519/ed25519.odin @@ -81,12 +81,8 @@ private_key_set_bytes :: proc(priv_key: ^Private_Key, b: []byte) -> bool { // private_key_bytes sets dst to byte-encoding of priv_key. private_key_bytes :: proc(priv_key: ^Private_Key, dst: []byte) { - if !priv_key._is_initialized { - panic("crypto/ed25519: uninitialized private key") - } - if len(dst) != PRIVATE_KEY_SIZE { - panic("crypto/ed25519: invalid destination size") - } + ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key") + ensure(len(dst) == PRIVATE_KEY_SIZE, "crypto/ed25519: invalid destination size") copy(dst, priv_key._b[:]) } @@ -98,12 +94,8 @@ private_key_clear :: proc "contextless" (priv_key: ^Private_Key) { // sign writes the signature by priv_key over msg to sig. sign :: proc(priv_key: ^Private_Key, msg, sig: []byte) { - if !priv_key._is_initialized { - panic("crypto/ed25519: uninitialized private key") - } - if len(sig) != SIGNATURE_SIZE { - panic("crypto/ed25519: invalid destination size") - } + ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key") + ensure(len(sig) == SIGNATURE_SIZE, "crypto/ed25519: invalid destination size") // 1. Compute the hash of the private key d, H(d) = (h_0, h_1, ..., h_2b-1) // using SHA-512 for Ed25519. H(d) may be precomputed. @@ -178,9 +170,7 @@ public_key_set_bytes :: proc "contextless" (pub_key: ^Public_Key, b: []byte) -> // public_key_set_priv sets pub_key to the public component of priv_key. 
public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) { - if !priv_key._is_initialized { - panic("crypto/ed25519: uninitialized public key") - } + ensure(priv_key._is_initialized, "crypto/ed25519: uninitialized private key") src := &priv_key._pub_key copy(pub_key._b[:], src._b[:]) @@ -191,21 +181,15 @@ public_key_set_priv :: proc(pub_key: ^Public_Key, priv_key: ^Private_Key) { // public_key_bytes sets dst to byte-encoding of pub_key. public_key_bytes :: proc(pub_key: ^Public_Key, dst: []byte) { - if !pub_key._is_initialized { - panic("crypto/ed25519: uninitialized public key") - } - if len(dst) != PUBLIC_KEY_SIZE { - panic("crypto/ed25519: invalid destination size") - } + ensure(pub_key._is_initialized, "crypto/ed25519: uninitialized public key") + ensure(len(dst) == PUBLIC_KEY_SIZE, "crypto/ed25519: invalid destination size") copy(dst, pub_key._b[:]) } // public_key_equal returns true iff pub_key is equal to other. public_key_equal :: proc(pub_key, other: ^Public_Key) -> bool { - if !pub_key._is_initialized || !other._is_initialized { - panic("crypto/ed25519: uninitialized public key") - } + ensure(pub_key._is_initialized && other._is_initialized, "crypto/ed25519: uninitialized public key") return crypto.compare_constant_time(pub_key._b[:], other._b[:]) == 1 } diff --git a/core/crypto/hmac/hmac.odin b/core/crypto/hmac/hmac.odin index 4813a9938..f74d6492f 100644 --- a/core/crypto/hmac/hmac.odin +++ b/core/crypto/hmac/hmac.odin @@ -56,7 +56,7 @@ init :: proc(ctx: ^Context, algorithm: hash.Algorithm, key: []byte) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) hash.update(&ctx._i_hash, data) } @@ -64,13 +64,10 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the tag to dst, and calls // reset on the Context.
final :: proc(ctx: ^Context, dst: []byte) { - assert(ctx._is_initialized) - defer (reset(ctx)) - if len(dst) != ctx._tag_sz { - panic("crypto/hmac: invalid destination tag size") - } + ensure(ctx._is_initialized) + ensure(len(dst) == ctx._tag_sz, "crypto/hmac: invalid destination tag size") hash.final(&ctx._i_hash, dst) // H((k ^ ipad) || text) @@ -105,14 +102,14 @@ reset :: proc(ctx: ^Context) { // algorithm returns the Algorithm used by a Context instance. algorithm :: proc(ctx: ^Context) -> hash.Algorithm { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) return hash.algorithm(&ctx._i_hash) } // tag_size returns the tag size of a Context instance in bytes. tag_size :: proc(ctx: ^Context) -> int { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) return ctx._tag_sz } diff --git a/core/crypto/kmac/kmac.odin b/core/crypto/kmac/kmac.odin index e8bf42946..6f58e20a7 100644 --- a/core/crypto/kmac/kmac.odin +++ b/core/crypto/kmac/kmac.odin @@ -36,6 +36,7 @@ sum :: proc(sec_strength: int, dst, msg, key, domain_sep: []byte) { // tag is valid. verify :: proc(sec_strength: int, tag, msg, key, domain_sep: []byte, allocator := context.temp_allocator) -> bool { derived_tag := make([]byte, len(tag), allocator) + defer(delete(derived_tag, allocator)) sum(sec_strength, derived_tag, msg, key, domain_sep) @@ -59,8 +60,6 @@ init_256 :: proc(ctx: ^Context, key, domain_sep: []byte) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized) - shake.write((^shake.Context)(ctx), data) } @@ -68,12 +67,9 @@ update :: proc(ctx: ^Context, data: []byte) { // on the Context. This routine will panic if the dst length is less than // MIN_TAG_SIZE.
final :: proc(ctx: ^Context, dst: []byte) { - assert(ctx.is_initialized) defer reset(ctx) - if len(dst) < MIN_TAG_SIZE { - panic("crypto/kmac: invalid KMAC tag_size, too short") - } + ensure(len(dst) >= MIN_TAG_SIZE, "crypto/kmac: invalid KMAC tag_size, too short") _sha3.final_cshake((^_sha3.Context)(ctx), dst) } @@ -103,14 +99,12 @@ _init_kmac :: proc(ctx: ^Context, key, s: []byte, sec_strength: int) { reset(ctx) } - if len(key) < sec_strength / 8 { - panic("crypto/kmac: invalid KMAC key, too short") - } + ensure(len(key) >= sec_strength / 8, "crypto/kmac: invalid KMAC key, too short") ctx_ := (^_sha3.Context)(ctx) _sha3.init_cshake(ctx_, N_KMAC, s, sec_strength) _sha3.bytepad(ctx_, [][]byte{key}, _sha3.rate_cshake(sec_strength)) } -@(private) +@(private, rodata) N_KMAC := []byte{'K', 'M', 'A', 'C'} diff --git a/core/crypto/legacy/keccak/keccak.odin b/core/crypto/legacy/keccak/keccak.odin index 6ca66b7ca..40fc2729f 100644 --- a/core/crypto/legacy/keccak/keccak.odin +++ b/core/crypto/legacy/keccak/keccak.odin @@ -40,37 +40,37 @@ BLOCK_SIZE_512 :: _sha3.RATE_512 Context :: distinct _sha3.Context // init_224 initializes a Context for Keccak-224. -init_224 :: proc(ctx: ^Context) { +init_224 :: proc "contextless" (ctx: ^Context) { ctx.mdlen = DIGEST_SIZE_224 _init(ctx) } // init_256 initializes a Context for Keccak-256. -init_256 :: proc(ctx: ^Context) { +init_256 :: proc "contextless" (ctx: ^Context) { ctx.mdlen = DIGEST_SIZE_256 _init(ctx) } // init_384 initializes a Context for Keccak-384. -init_384 :: proc(ctx: ^Context) { +init_384 :: proc "contextless" (ctx: ^Context) { ctx.mdlen = DIGEST_SIZE_384 _init(ctx) } // init_512 initializes a Context for Keccak-512. 
-init_512 :: proc(ctx: ^Context) { +init_512 :: proc "contextless" (ctx: ^Context) { ctx.mdlen = DIGEST_SIZE_512 _init(ctx) } @(private) -_init :: proc(ctx: ^Context) { +_init :: proc "contextless" (ctx: ^Context) { ctx.dsbyte = _sha3.DS_KECCAK _sha3.init((^_sha3.Context)(ctx)) } // update adds more data to the Context. -update :: proc(ctx: ^Context, data: []byte) { +update :: proc "contextless" (ctx: ^Context, data: []byte) { _sha3.update((^_sha3.Context)(ctx), data) } @@ -79,17 +79,17 @@ update :: proc(ctx: ^Context, data: []byte) { // // Iff finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. -final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { +final :: proc "contextless" (ctx: ^Context, hash: []byte, finalize_clone: bool = false) { _sha3.final((^_sha3.Context)(ctx), hash, finalize_clone) } // clone clones the Context other into ctx. -clone :: proc(ctx, other: ^Context) { +clone :: proc "contextless" (ctx, other: ^Context) { _sha3.clone((^_sha3.Context)(ctx), (^_sha3.Context)(other)) } // reset sanitizes the Context. The Context must be re-initialized to // be used again. -reset :: proc(ctx: ^Context) { +reset :: proc "contextless" (ctx: ^Context) { _sha3.reset((^_sha3.Context)(ctx)) } diff --git a/core/crypto/legacy/md5/md5.odin b/core/crypto/legacy/md5/md5.odin index 28b47e0b3..050501d98 100644 --- a/core/crypto/legacy/md5/md5.odin +++ b/core/crypto/legacy/md5/md5.odin @@ -53,7 +53,7 @@ init :: proc(ctx: ^Context) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized) + ensure(ctx.is_initialized) for i := 0; i < len(data); i += 1 { ctx.data[ctx.datalen] = data[i] @@ -72,11 +72,8 @@ update :: proc(ctx: ^Context, data: []byte) { // Iff finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) - - if len(hash) < DIGEST_SIZE { - panic("crypto/md5: invalid destination digest size") - } + ensure(ctx.is_initialized) + ensure(len(hash) >= DIGEST_SIZE, "crypto/md5: invalid destination digest size") ctx := ctx if finalize_clone { diff --git a/core/crypto/legacy/sha1/sha1.odin b/core/crypto/legacy/sha1/sha1.odin index 1025ecb5b..5a2b57005 100644 --- a/core/crypto/legacy/sha1/sha1.odin +++ b/core/crypto/legacy/sha1/sha1.odin @@ -60,7 +60,7 @@ init :: proc(ctx: ^Context) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized) + ensure(ctx.is_initialized) for i := 0; i < len(data); i += 1 { ctx.data[ctx.datalen] = data[i] @@ -79,11 +79,8 @@ update :: proc(ctx: ^Context, data: []byte) { // Iff finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) - - if len(hash) < DIGEST_SIZE { - panic("crypto/sha1: invalid destination digest size") - } + ensure(ctx.is_initialized) + ensure(len(hash) >= DIGEST_SIZE, "crypto/sha1: invalid destination digest size") ctx := ctx if finalize_clone { diff --git a/core/crypto/poly1305/poly1305.odin b/core/crypto/poly1305/poly1305.odin index ea0e6c907..3dd915da7 100644 --- a/core/crypto/poly1305/poly1305.odin +++ b/core/crypto/poly1305/poly1305.odin @@ -60,9 +60,7 @@ Context :: struct { // init initializes a Context with the specified key. The key SHOULD be // unique and MUST be unpredictable for each invocation. 
init :: proc(ctx: ^Context, key: []byte) { - if len(key) != KEY_SIZE { - panic("crypto/poly1305: invalid key size") - } + ensure(len(key) == KEY_SIZE, "crypto/poly1305: invalid key size") // r = le_bytes_to_num(key[0..15]) // r = clamp(r) (r &= 0xffffffc0ffffffc0ffffffc0fffffff) @@ -85,7 +83,7 @@ init :: proc(ctx: ^Context, key: []byte) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx._is_initialized) + ensure(ctx._is_initialized) msg := data msg_len := len(data) @@ -124,12 +122,10 @@ update :: proc(ctx: ^Context, data: []byte) { // final finalizes the Context, writes the tag to dst, and calls // reset on the Context. final :: proc(ctx: ^Context, dst: []byte) { - assert(ctx._is_initialized) defer reset(ctx) - if len(dst) != TAG_SIZE { - panic("poly1305: invalid destination tag size") - } + ensure(ctx._is_initialized) + ensure(len(dst) == TAG_SIZE, "poly1305: invalid destination tag size") // Process remaining block if ctx._leftover > 0 { diff --git a/core/crypto/ristretto255/ristretto255.odin b/core/crypto/ristretto255/ristretto255.odin index 7b0944e33..20a002900 100644 --- a/core/crypto/ristretto255/ristretto255.odin +++ b/core/crypto/ristretto255/ristretto255.odin @@ -16,7 +16,7 @@ ELEMENT_SIZE :: 32 // group element. 
WIDE_ELEMENT_SIZE :: 64 -@(private) +@(private, rodata) FE_NEG_ONE := field.Tight_Field_Element { 2251799813685228, 2251799813685247, @@ -24,7 +24,7 @@ FE_NEG_ONE := field.Tight_Field_Element { 2251799813685247, 2251799813685247, } -@(private) +@(private, rodata) FE_INVSQRT_A_MINUS_D := field.Tight_Field_Element { 278908739862762, 821645201101625, @@ -32,7 +32,7 @@ FE_INVSQRT_A_MINUS_D := field.Tight_Field_Element { 1777959178193151, 2118520810568447, } -@(private) +@(private, rodata) FE_ONE_MINUS_D_SQ := field.Tight_Field_Element { 1136626929484150, 1998550399581263, @@ -40,7 +40,7 @@ FE_ONE_MINUS_D_SQ := field.Tight_Field_Element { 118527312129759, 45110755273534, } -@(private) +@(private, rodata) FE_D_MINUS_ONE_SQUARED := field.Tight_Field_Element { 1507062230895904, 1572317787530805, @@ -48,7 +48,7 @@ FE_D_MINUS_ONE_SQUARED := field.Tight_Field_Element { 317374165784489, 1572899562415810, } -@(private) +@(private, rodata) FE_SQRT_AD_MINUS_ONE := field.Tight_Field_Element { 2241493124984347, 425987919032274, @@ -76,7 +76,7 @@ ge_clear :: proc "contextless" (ge: ^Group_Element) { // ge_set sets `ge = a`. ge_set :: proc(ge, a: ^Group_Element) { - _ge_assert_initialized([]^Group_Element{a}) + _ge_ensure_initialized([]^Group_Element{a}) grp.ge_set(&ge._p, &a._p) ge._is_initialized = true @@ -199,9 +199,7 @@ ge_set_bytes :: proc "contextless" (ge: ^Group_Element, b: []byte) -> bool { // ge_set_wide_bytes sets ge to the result of deriving a ristretto255 // group element, from a wide (512-bit) byte string. ge_set_wide_bytes :: proc(ge: ^Group_Element, b: []byte) { - if len(b) != WIDE_ELEMENT_SIZE { - panic("crypto/ristretto255: invalid wide input size") - } + ensure(len(b) == WIDE_ELEMENT_SIZE, "crypto/ristretto255: invalid wide input size") // The element derivation function on an input string b proceeds as // follows: @@ -222,10 +220,8 @@ ge_set_wide_bytes :: proc(ge: ^Group_Element, b: []byte) { // ge_bytes sets dst to the canonical encoding of ge. 
ge_bytes :: proc(ge: ^Group_Element, dst: []byte) { - _ge_assert_initialized([]^Group_Element{ge}) - if len(dst) != ELEMENT_SIZE { - panic("crypto/ristretto255: invalid destination size") - } + _ge_ensure_initialized([]^Group_Element{ge}) + ensure(len(dst) == ELEMENT_SIZE, "crypto/ristretto255: invalid destination size") x0, y0, z0, t0 := &ge._p.x, &ge._p.y, &ge._p.z, &ge._p.t @@ -306,7 +302,7 @@ ge_bytes :: proc(ge: ^Group_Element, dst: []byte) { // ge_add sets `ge = a + b`. ge_add :: proc(ge, a, b: ^Group_Element) { - _ge_assert_initialized([]^Group_Element{a, b}) + _ge_ensure_initialized([]^Group_Element{a, b}) grp.ge_add(&ge._p, &a._p, &b._p) ge._is_initialized = true @@ -314,7 +310,7 @@ ge_add :: proc(ge, a, b: ^Group_Element) { // ge_double sets `ge = a + a`. ge_double :: proc(ge, a: ^Group_Element) { - _ge_assert_initialized([]^Group_Element{a}) + _ge_ensure_initialized([]^Group_Element{a}) grp.ge_double(&ge._p, &a._p) ge._is_initialized = true @@ -322,7 +318,7 @@ ge_double :: proc(ge, a: ^Group_Element) { // ge_negate sets `ge = -a`. ge_negate :: proc(ge, a: ^Group_Element) { - _ge_assert_initialized([]^Group_Element{a}) + _ge_ensure_initialized([]^Group_Element{a}) grp.ge_negate(&ge._p, &a._p) ge._is_initialized = true @@ -330,7 +326,7 @@ ge_negate :: proc(ge, a: ^Group_Element) { // ge_scalarmult sets `ge = A * sc`. ge_scalarmult :: proc(ge, A: ^Group_Element, sc: ^Scalar) { - _ge_assert_initialized([]^Group_Element{A}) + _ge_ensure_initialized([]^Group_Element{A}) grp.ge_scalarmult(&ge._p, &A._p, sc) ge._is_initialized = true @@ -344,7 +340,7 @@ ge_scalarmult_generator :: proc "contextless" (ge: ^Group_Element, sc: ^Scalar) // ge_scalarmult_vartime sets `ge = A * sc` in variable time. 
ge_scalarmult_vartime :: proc(ge, A: ^Group_Element, sc: ^Scalar) { - _ge_assert_initialized([]^Group_Element{A}) + _ge_ensure_initialized([]^Group_Element{A}) grp.ge_scalarmult_vartime(&ge._p, &A._p, sc) ge._is_initialized = true @@ -358,7 +354,7 @@ ge_double_scalarmult_generator_vartime :: proc( A: ^Group_Element, b: ^Scalar, ) { - _ge_assert_initialized([]^Group_Element{A}) + _ge_ensure_initialized([]^Group_Element{A}) grp.ge_double_scalarmult_basepoint_vartime(&ge._p, a, &A._p, b) ge._is_initialized = true @@ -367,7 +363,7 @@ ge_double_scalarmult_generator_vartime :: proc( // ge_cond_negate sets `ge = a` iff `ctrl == 0` and `ge = -a` iff `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { - _ge_assert_initialized([]^Group_Element{a}) + _ge_ensure_initialized([]^Group_Element{a}) grp.ge_cond_negate(&ge._p, &a._p, ctrl) ge._is_initialized = true @@ -376,7 +372,7 @@ ge_cond_negate :: proc(ge, a: ^Group_Element, ctrl: int) { // ge_cond_assign sets `ge = ge` iff `ctrl == 0` and `ge = a` iff `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { - _ge_assert_initialized([]^Group_Element{ge, a}) + _ge_ensure_initialized([]^Group_Element{ge, a}) grp.ge_cond_assign(&ge._p, &a._p, ctrl) } @@ -384,7 +380,7 @@ ge_cond_assign :: proc(ge, a: ^Group_Element, ctrl: int) { // ge_cond_select sets `ge = a` iff `ctrl == 0` and `ge = b` iff `ctrl == 1`. // Behavior for all other values of ctrl are undefined, ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { - _ge_assert_initialized([]^Group_Element{a, b}) + _ge_ensure_initialized([]^Group_Element{a, b}) grp.ge_cond_select(&ge._p, &a._p, &b._p, ctrl) ge._is_initialized = true @@ -393,7 +389,7 @@ ge_cond_select :: proc(ge, a, b: ^Group_Element, ctrl: int) { // ge_equal returns 1 iff `a == b`, and 0 otherwise. 
@(require_results) ge_equal :: proc(a, b: ^Group_Element) -> int { - _ge_assert_initialized([]^Group_Element{a, b}) + _ge_ensure_initialized([]^Group_Element{a, b}) // CT_EQ(x1 * y2, y1 * x2) | CT_EQ(y1 * y2, x1 * x2) ax_by, ay_bx, ay_by, ax_bx: field.Tight_Field_Element = ---, ---, ---, --- @@ -501,10 +497,8 @@ ge_map :: proc "contextless" (ge: ^Group_Element, b: []byte) { } @(private) -_ge_assert_initialized :: proc(ges: []^Group_Element) { +_ge_ensure_initialized :: proc(ges: []^Group_Element) { for ge in ges { - if !ge._is_initialized { - panic("crypto/ristretto255: uninitialized group element") - } + ensure(ge._is_initialized, "crypto/ristretto255: uninitialized group element") } } diff --git a/core/crypto/ristretto255/ristretto255_scalar.odin b/core/crypto/ristretto255/ristretto255_scalar.odin index 1ecb490e0..75844b3f4 100644 --- a/core/crypto/ristretto255/ristretto255_scalar.odin +++ b/core/crypto/ristretto255/ristretto255_scalar.odin @@ -42,9 +42,7 @@ sc_set_bytes :: proc(sc: ^Scalar, b: []byte) -> bool { // scalar, from a wide (512-bit) byte string by interpreting b as a // little-endian value, and reducing it mod the group order. sc_set_bytes_wide :: proc(sc: ^Scalar, b: []byte) { - if len(b) != WIDE_SCALAR_SIZE { - panic("crypto/ristretto255: invalid wide input size") - } + ensure(len(b) == WIDE_SCALAR_SIZE, "crypto/ristretto255: invalid wide input size") b_ := (^[WIDE_SCALAR_SIZE]byte)(raw_data(b)) grp.sc_set_bytes_wide(sc, b_) @@ -52,9 +50,7 @@ sc_set_bytes_wide :: proc(sc: ^Scalar, b: []byte) { // sc_bytes sets dst to the canonical encoding of sc. 
sc_bytes :: proc(sc: ^Scalar, dst: []byte) { - if len(dst) != SCALAR_SIZE { - panic("crypto/ristretto255: invalid destination size") - } + ensure(len(dst) == SCALAR_SIZE, "crypto/ristretto255: invalid destination size") grp.sc_bytes(dst, sc) } diff --git a/core/crypto/sha2/sha2.odin b/core/crypto/sha2/sha2.odin index 4230851ab..bf9b81601 100644 --- a/core/crypto/sha2/sha2.odin +++ b/core/crypto/sha2/sha2.odin @@ -15,9 +15,9 @@ package sha2 zhibog, dotbmp: Initial implementation. */ -import "core:encoding/endian" +@(require) import "core:encoding/endian" import "core:math/bits" -import "core:mem" +@(require) import "core:mem" // DIGEST_SIZE_224 is the SHA-224 digest size in bytes. DIGEST_SIZE_224 :: 28 @@ -158,7 +158,7 @@ _init :: proc(ctx: ^$T) { // update adds more data to the Context. update :: proc(ctx: ^$T, data: []byte) { - assert(ctx.is_initialized) + ensure(ctx.is_initialized) when T == Context_256 { CURR_BLOCK_SIZE :: BLOCK_SIZE_256 @@ -194,11 +194,8 @@ update :: proc(ctx: ^$T, data: []byte) { // Iff finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) - - if len(hash) * 8 < ctx.md_bits { - panic("crypto/sha2: invalid destination digest size") - } + ensure(ctx.is_initialized) + ensure(len(hash) * 8 >= ctx.md_bits, "crypto/sha2: invalid destination digest size") ctx := ctx if finalize_clone { @@ -238,7 +235,7 @@ final :: proc(ctx: ^$T, hash: []byte, finalize_clone: bool = false) { endian.unchecked_put_u64be(pad[8:], length_lo) update(ctx, pad[0:16]) } - assert(ctx.bitlength == 0) + assert(ctx.bitlength == 0) // Check for bugs when T == Context_256 { for i := 0; i < ctx.md_bits / 32; i += 1 { @@ -270,8 +267,8 @@ reset :: proc(ctx: ^$T) { SHA2 implementation */ -@(private) -sha256_k := [64]u32 { +@(private, rodata) +SHA256_K := [64]u32 { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -290,8 +287,8 @@ sha256_k := [64]u32 { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, } -@(private) -sha512_k := [80]u64 { +@(private, rodata) +SHA512_K := [80]u64 { 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019, @@ -334,6 +331,11 @@ sha512_k := [80]u64 { 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, } +@(private) +SHA256_ROUNDS :: 64 +@(private) +SHA512_ROUNDS :: 80 + @(private) SHA256_CH :: #force_inline proc "contextless" (x, y, z: u32) -> u32 { return (x & y) ~ (~x & z) @@ -395,22 +397,29 @@ SHA512_F4 :: #force_inline proc "contextless" (x: u64) -> u64 { } @(private) -sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) { +sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) #no_bounds_check { when T == Context_256 { - w: [64]u32 + if is_hardware_accelerated_256() { + sha256_transf_hw(ctx, data) + return + } + + w: [SHA256_ROUNDS]u32 wv: [8]u32 t1, t2: u32 + CURR_BLOCK_SIZE :: BLOCK_SIZE_256 } else when T == Context_512 { - w: [80]u64 + w: [SHA512_ROUNDS]u64 
wv: [8]u64 t1, t2: u64 + CURR_BLOCK_SIZE :: BLOCK_SIZE_512 } data := data for len(data) >= CURR_BLOCK_SIZE { - for i := 0; i < 16; i += 1 { + for i in 0 ..< 16 { when T == Context_256 { w[i] = endian.unchecked_get_u32be(data[i * 4:]) } else when T == Context_512 { @@ -419,22 +428,22 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) { } when T == Context_256 { - for i := 16; i < 64; i += 1 { + for i in 16 ..< SHA256_ROUNDS { w[i] = SHA256_F4(w[i - 2]) + w[i - 7] + SHA256_F3(w[i - 15]) + w[i - 16] } } else when T == Context_512 { - for i := 16; i < 80; i += 1 { + for i in 16 ..< SHA512_ROUNDS { w[i] = SHA512_F4(w[i - 2]) + w[i - 7] + SHA512_F3(w[i - 15]) + w[i - 16] } } - for i := 0; i < 8; i += 1 { + for i in 0 ..< 8 { wv[i] = ctx.h[i] } when T == Context_256 { - for i := 0; i < 64; i += 1 { - t1 = wv[7] + SHA256_F2(wv[4]) + SHA256_CH(wv[4], wv[5], wv[6]) + sha256_k[i] + w[i] + for i in 0 ..< SHA256_ROUNDS { + t1 = wv[7] + SHA256_F2(wv[4]) + SHA256_CH(wv[4], wv[5], wv[6]) + SHA256_K[i] + w[i] t2 = SHA256_F1(wv[0]) + SHA256_MAJ(wv[0], wv[1], wv[2]) wv[7] = wv[6] wv[6] = wv[5] @@ -446,8 +455,8 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) { wv[0] = t1 + t2 } } else when T == Context_512 { - for i := 0; i < 80; i += 1 { - t1 = wv[7] + SHA512_F2(wv[4]) + SHA512_CH(wv[4], wv[5], wv[6]) + sha512_k[i] + w[i] + for i in 0 ..< SHA512_ROUNDS { + t1 = wv[7] + SHA512_F2(wv[4]) + SHA512_CH(wv[4], wv[5], wv[6]) + SHA512_K[i] + w[i] t2 = SHA512_F1(wv[0]) + SHA512_MAJ(wv[0], wv[1], wv[2]) wv[7] = wv[6] wv[6] = wv[5] @@ -460,7 +469,7 @@ sha2_transf :: proc "contextless" (ctx: ^$T, data: []byte) { } } - for i := 0; i < 8; i += 1 { + for i in 0 ..< 8 { ctx.h[i] += wv[i] } diff --git a/core/crypto/sha2/sha2_impl_hw_gen.odin b/core/crypto/sha2/sha2_impl_hw_gen.odin new file mode 100644 index 000000000..85c7f8b28 --- /dev/null +++ b/core/crypto/sha2/sha2_impl_hw_gen.odin @@ -0,0 +1,15 @@ +#+build !amd64 +package sha2 + +@(private = "file") 
+ERR_HW_NOT_SUPPORTED :: "crypto/sha2: hardware implementation unsupported" + +// is_hardware_accelerated_256 returns true iff hardware accelerated +// SHA-224/SHA-256 is supported. +is_hardware_accelerated_256 :: proc "contextless" () -> bool { + return false +} + +sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) { + panic_contextless(ERR_HW_NOT_SUPPORTED) +} diff --git a/core/crypto/sha2/sha2_impl_hw_intel.odin b/core/crypto/sha2/sha2_impl_hw_intel.odin new file mode 100644 index 000000000..f16f353df --- /dev/null +++ b/core/crypto/sha2/sha2_impl_hw_intel.odin @@ -0,0 +1,260 @@ +#+build amd64 +package sha2 + +// Based on the public domain code by Jeffrey Walton, though +// realistically, there only is one sensible way to write this +// and Intel's whitepaper covers it. +// +// See: https://github.com/noloader/SHA-Intrinsics + +import "base:intrinsics" +import "core:simd" +import "core:simd/x86" +import "core:sys/info" + +@(private = "file") +MASK :: x86.__m128i{0x0405060700010203, 0x0c0d0e0f08090a0b} + +@(private = "file") +K_0 :: simd.u64x2{0x71374491428a2f98, 0xe9b5dba5b5c0fbcf} +@(private = "file") +K_1 :: simd.u64x2{0x59f111f13956c25b, 0xab1c5ed5923f82a4} +@(private = "file") +K_2 :: simd.u64x2{0x12835b01d807aa98, 0x550c7dc3243185be} +@(private = "file") +K_3 :: simd.u64x2{0x80deb1fe72be5d74, 0xc19bf1749bdc06a7} +@(private = "file") +K_4 :: simd.u64x2{0xefbe4786e49b69c1, 0x240ca1cc0fc19dc6} +@(private = "file") +K_5 :: simd.u64x2{0x4a7484aa2de92c6f, 0x76f988da5cb0a9dc} +@(private = "file") +K_6 :: simd.u64x2{0xa831c66d983e5152, 0xbf597fc7b00327c8} +@(private = "file") +K_7 :: simd.u64x2{0xd5a79147c6e00bf3, 0x1429296706ca6351} +@(private = "file") +K_8 :: simd.u64x2{0x2e1b213827b70a85, 0x53380d134d2c6dfc} +@(private = "file") +K_9 :: simd.u64x2{0x766a0abb650a7354, 0x92722c8581c2c92e} +@(private = "file") +K_10 :: simd.u64x2{0xa81a664ba2bfe8a1, 0xc76c51a3c24b8b70} +@(private = "file") +K_11 :: simd.u64x2{0xd6990624d192e819, 
0x106aa070f40e3585} +@(private = "file") +K_12 :: simd.u64x2{0x1e376c0819a4c116, 0x34b0bcb52748774c} +@(private = "file") +K_13 :: simd.u64x2{0x4ed8aa4a391c0cb3, 0x682e6ff35b9cca4f} +@(private = "file") +K_14 :: simd.u64x2{0x78a5636f748f82ee, 0x8cc7020884c87814} +@(private = "file") +K_15 :: simd.u64x2{0xa4506ceb90befffa, 0xc67178f2bef9a3f7} + + +// is_hardware_accelerated_256 returns true iff hardware accelerated +// SHA-224/SHA-256 is supported. +is_hardware_accelerated_256 :: proc "contextless" () -> bool { + features, ok := info.cpu_features.? + if !ok { + return false + } + + req_features :: info.CPU_Features{ + .sse2, + .ssse3, + .sse41, + .sha, + } + return features >= req_features +} + +@(private, enable_target_feature="sse2,ssse3,sse4.1,sha") +sha256_transf_hw :: proc "contextless" (ctx: ^Context_256, data: []byte) #no_bounds_check { + // Load the state + tmp := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[0])) + state_1 := intrinsics.unaligned_load((^x86.__m128i)(&ctx.h[4])) + + tmp = x86._mm_shuffle_epi32(tmp, 0xb1) // CDAB + state_1 = x86._mm_shuffle_epi32(state_1, 0x1b) // EFGH + state_0 := x86._mm_alignr_epi8(tmp, state_1, 8) // ABEF + // state_1 = x86._mm_blend_epi16(state_1, tmp, 0xf0) // CDGH + state_1 = kludge_mm_blend_epi16_0xf0(state_1, tmp) + + data := data + for len(data) >= BLOCK_SIZE_256 { + state_0_save, state_1_save := state_0, state_1 + + // Rounds 0-3 + msg := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data))) + msg_0 := x86._mm_shuffle_epi8(msg, MASK) + msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_0)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + msg = x86._mm_shuffle_epi32(msg, 0xe) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + + // Rounds 4-7 + msg_1 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[16:]))) + msg_1 = x86._mm_shuffle_epi8(msg_1, MASK) + msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_1)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + msg = 
x86._mm_shuffle_epi32(msg, 0xe) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1) + + // Rounds 8-11 + msg_2 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[32:]))) + msg_2 = x86._mm_shuffle_epi8(msg_2, MASK) + msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_2)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + msg = x86._mm_shuffle_epi32(msg, 0xe) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2) + + // Rounds 12-15 + msg_3 := intrinsics.unaligned_load((^x86.__m128i)(raw_data(data[48:]))) + msg_3 = x86._mm_shuffle_epi8(msg_3, MASK) + msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_3)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4) + msg_0 = x86._mm_add_epi32(msg_0, tmp) + msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3) + + // Rounds 16-19 + msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_4)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4) + msg_1 = x86._mm_add_epi32(msg_1, tmp) + msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0) + + // Rounds 20-23 + msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_5)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4) + msg_2 = x86._mm_add_epi32(msg_2, tmp) + msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1) + + // Rounds 24-27 + msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_6)) + state_1 = 
x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4) + msg_3 = x86._mm_add_epi32(msg_3, tmp) + msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_1 = x86._mm_sha256msg1_epu32(msg_1, msg_2) + + // Rounds 28-31 + msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_7)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4) + msg_0 = x86._mm_add_epi32(msg_0, tmp) + msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3) + + // Rounds 32-35 + msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_8)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4) + msg_1 = x86._mm_add_epi32(msg_1, tmp) + msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0) + + // Rounds 36-39 + msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_9)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4) + msg_2 = x86._mm_add_epi32(msg_2, tmp) + msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_0 = x86._mm_sha256msg1_epu32(msg_0, msg_1) + + // Rounds 40-43 + msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_10)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4) + msg_3 = x86._mm_add_epi32(msg_3, tmp) + msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_1 = x86._mm_sha256msg1_epu32(msg_1, 
msg_2) + + // Rounds 44-47 + msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_11)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_3, msg_2, 4) + msg_0 = x86._mm_add_epi32(msg_0, tmp) + msg_0 = x86._mm_sha256msg2_epu32(msg_0, msg_3) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_2 = x86._mm_sha256msg1_epu32(msg_2, msg_3) + + // Rounds 48-51 + msg = x86._mm_add_epi32(msg_0, x86.__m128i(K_12)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_0, msg_3, 4) + msg_1 = x86._mm_add_epi32(msg_1, tmp) + msg_1 = x86._mm_sha256msg2_epu32(msg_1, msg_0) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + msg_3 = x86._mm_sha256msg1_epu32(msg_3, msg_0) + + // Rounds 52-55 + msg = x86._mm_add_epi32(msg_1, x86.__m128i(K_13)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_1, msg_0, 4) + msg_2 = x86._mm_add_epi32(msg_2, tmp) + msg_2 = x86._mm_sha256msg2_epu32(msg_2, msg_1) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + + /* Rounds 56-59 */ + msg = x86._mm_add_epi32(msg_2, x86.__m128i(K_14)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + tmp = x86._mm_alignr_epi8(msg_2, msg_1, 4) + msg_3 = x86._mm_add_epi32(msg_3, tmp) + msg_3 = x86._mm_sha256msg2_epu32(msg_3, msg_2) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + + // Rounds 60-63 + msg = x86._mm_add_epi32(msg_3, x86.__m128i(K_15)) + state_1 = x86._mm_sha256rnds2_epu32(state_1, state_0, msg) + msg = x86._mm_shuffle_epi32(msg, 0x0e) + state_0 = x86._mm_sha256rnds2_epu32(state_0, state_1, msg) + + state_0 = x86._mm_add_epi32(state_0, state_0_save) + state_1 = x86._mm_add_epi32(state_1, state_1_save) + + data = data[BLOCK_SIZE_256:] + } + + // Write back the updated state 
+ tmp = x86._mm_shuffle_epi32(state_0, 0x1b) // FEBA + state_1 = x86._mm_shuffle_epi32(state_1, 0xb1) // DCHG + // state_0 = x86._mm_blend_epi16(tmp, state_1, 0xf0) // DCBA + state_0 = kludge_mm_blend_epi16_0xf0(tmp, state_1) + state_1 = x86._mm_alignr_epi8(state_1, tmp, 8) // ABEF + + intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[0]), state_0) + intrinsics.unaligned_store((^x86.__m128i)(&ctx.h[4]), state_1) +} + +@(private = "file") +kludge_mm_blend_epi16_0xf0 :: #force_inline proc "contextless"(a, b: x86.__m128i) -> x86.__m128i { + // HACK HACK HACK: LLVM got rid of `llvm.x86.sse41.pblendw`. + a_ := simd.to_array(a) + b_ := simd.to_array(b) + return x86.__m128i{a_[0], b_[1]} +} diff --git a/core/crypto/siphash/siphash.odin b/core/crypto/siphash/siphash.odin index c145ab3f0..f9fe50cb0 100644 --- a/core/crypto/siphash/siphash.odin +++ b/core/crypto/siphash/siphash.odin @@ -219,18 +219,14 @@ verify_4_8 :: proc { */ init :: proc(ctx: ^Context, key: []byte, c_rounds, d_rounds: int) { - if len(key) != KEY_SIZE { - panic("crypto/siphash; invalid key size") - } + ensure(len(key) == KEY_SIZE,"crypto/siphash; invalid key size") ctx.c_rounds = c_rounds ctx.d_rounds = d_rounds is_valid_setting := (ctx.c_rounds == 1 && ctx.d_rounds == 3) || (ctx.c_rounds == 2 && ctx.d_rounds == 4) || (ctx.c_rounds == 4 && ctx.d_rounds == 8) - if !is_valid_setting { - panic("crypto/siphash: incorrect rounds set up") - } + ensure(is_valid_setting, "crypto/siphash: incorrect rounds set up") ctx.k0 = endian.unchecked_get_u64le(key[:8]) ctx.k1 = endian.unchecked_get_u64le(key[8:]) ctx.v0 = 0x736f6d6570736575 ~ ctx.k0 @@ -245,7 +241,7 @@ init :: proc(ctx: ^Context, key: []byte, c_rounds, d_rounds: int) { } update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized, "crypto/siphash: context is not initialized") + ensure(ctx.is_initialized) data := data ctx.total_length += len(data) @@ -269,7 +265,7 @@ update :: proc(ctx: ^Context, data: []byte) { } final :: proc(ctx: ^Context, dst: 
^u64) { - assert(ctx.is_initialized, "crypto/siphash: context is not initialized") + ensure(ctx.is_initialized) tmp: [BLOCK_SIZE]byte copy(tmp[:], ctx.buf[:ctx.last_block]) @@ -336,9 +332,8 @@ _get_byte :: #force_inline proc "contextless" (byte_num: byte, into: u64) -> byt @(private) _collect_output :: #force_inline proc(dst: []byte, hash: u64) { - if len(dst) < DIGEST_SIZE { - panic("crypto/siphash: invalid tag size") - } + ensure(len(dst) >= DIGEST_SIZE, "crypto/siphash: invalid tag size") + dst[0] = _get_byte(7, hash) dst[1] = _get_byte(6, hash) dst[2] = _get_byte(5, hash) diff --git a/core/crypto/sm3/sm3.odin b/core/crypto/sm3/sm3.odin index f910d735b..6487c5e8c 100644 --- a/core/crypto/sm3/sm3.odin +++ b/core/crypto/sm3/sm3.odin @@ -53,7 +53,7 @@ init :: proc(ctx: ^Context) { // update adds more data to the Context. update :: proc(ctx: ^Context, data: []byte) { - assert(ctx.is_initialized) + ensure(ctx.is_initialized) data := data ctx.length += u64(len(data)) @@ -83,11 +83,8 @@ update :: proc(ctx: ^Context, data: []byte) { // Iff finalize_clone is set, final will work on a copy of the Context, // which is useful for for calculating rolling digests. 
final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { - assert(ctx.is_initialized) - - if len(hash) < DIGEST_SIZE { - panic("crypto/sm3: invalid destination digest size") - } + ensure(ctx.is_initialized) + ensure(len(hash) >= DIGEST_SIZE, "crypto/sm3: invalid destination digest size") ctx := ctx if finalize_clone { @@ -110,7 +107,7 @@ final :: proc(ctx: ^Context, hash: []byte, finalize_clone: bool = false) { length <<= 3 endian.unchecked_put_u64be(pad[:], length) update(ctx, pad[0:8]) - assert(ctx.bitlength == 0) + assert(ctx.bitlength == 0) // Check for bugs for i := 0; i < DIGEST_SIZE / 4; i += 1 { endian.unchecked_put_u32be(hash[i * 4:], ctx.state[i]) @@ -136,7 +133,7 @@ reset :: proc(ctx: ^Context) { SM3 implementation */ -@(private) +@(private, rodata) IV := [8]u32 { 0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600, 0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e, diff --git a/core/crypto/x25519/x25519.odin b/core/crypto/x25519/x25519.odin index 412a767b8..6805c3ff8 100644 --- a/core/crypto/x25519/x25519.odin +++ b/core/crypto/x25519/x25519.odin @@ -15,7 +15,7 @@ SCALAR_SIZE :: 32 // POINT_SIZE is the size of a X25519 point (public key/shared secret) in bytes. POINT_SIZE :: 32 -@(private) +@(private, rodata) _BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} @(private) @@ -101,15 +101,9 @@ _scalarmult :: proc "contextless" (out, scalar, point: ^[32]byte) { // scalarmult "multiplies" the provided scalar and point, and writes the // resulting point to dst. 
scalarmult :: proc(dst, scalar, point: []byte) { - if len(scalar) != SCALAR_SIZE { - panic("crypto/x25519: invalid scalar size") - } - if len(point) != POINT_SIZE { - panic("crypto/x25519: invalid point size") - } - if len(dst) != POINT_SIZE { - panic("crypto/x25519: invalid destination point size") - } + ensure(len(scalar) == SCALAR_SIZE, "crypto/x25519: invalid scalar size") + ensure(len(point) == POINT_SIZE, "crypto/x25519: invalid point size") + ensure(len(dst) == POINT_SIZE, "crypto/x25519: invalid destination point size") // "clamp" the scalar e: [32]byte = --- diff --git a/core/crypto/x448/x448.odin b/core/crypto/x448/x448.odin new file mode 100644 index 000000000..43c5d25e0 --- /dev/null +++ b/core/crypto/x448/x448.odin @@ -0,0 +1,155 @@ +/* +package x448 implements the X448 (aka curve448) Elliptic-Curve +Diffie-Hellman key exchange protocol. + +See: +- [[ https://www.rfc-editor.org/rfc/rfc7748 ]] +*/ +package x448 + +import field "core:crypto/_fiat/field_curve448" +import "core:mem" + +// SCALAR_SIZE is the size of a X448 scalar (private key) in bytes. +SCALAR_SIZE :: 56 +// POINT_SIZE is the size of a X448 point (public key/shared secret) in bytes. +POINT_SIZE :: 56 + +@(private, rodata) +_BASE_POINT: [56]byte = { + 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +} + +@(private) +_scalar_bit :: #force_inline proc "contextless" (s: ^[56]byte, i: int) -> u8 { + if i < 0 { + return 0 + } + return (s[i >> 3] >> uint(i & 7)) & 1 +} + +@(private) +_scalarmult :: proc "contextless" (out, scalar, point: ^[56]byte) { + // Montgomery pseudo-multiplication, using the RFC 7748 formula. 
+ t1, t2: field.Loose_Field_Element = ---, --- + + // x_1 = u + // x_2 = 1 + // z_2 = 0 + // x_3 = u + // z_3 = 1 + x1: field.Tight_Field_Element = --- + field.fe_from_bytes(&x1, point) + + x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, --- + field.fe_one(&x2) + field.fe_zero(&z2) + field.fe_set(&x3, &x1) + field.fe_one(&z3) + + // swap = 0 + swap: int + + // For t = bits-1 down to 0: + for t := 448 - 1; t >= 0; t -= 1 { + // k_t = (k >> t) & 1 + k_t := int(_scalar_bit(scalar, t)) + // swap ^= k_t + swap ~= k_t + // Conditional swap; see text below. + // (x_2, x_3) = cswap(swap, x_2, x_3) + field.fe_cond_swap(&x2, &x3, swap) + // (z_2, z_3) = cswap(swap, z_2, z_3) + field.fe_cond_swap(&z2, &z3, swap) + // swap = k_t + swap = k_t + + // Note: This deliberately omits reductions after add/sub operations + // if the result is only ever used as the input to a mul/square since + // the implementations of those can deal with non-reduced inputs. + // + // fe_tighten_cast is only used to store a fully reduced + // output in a Loose_Field_Element, or to provide such a + // Loose_Field_Element as a Tight_Field_Element argument. 
+ + // A = x_2 + z_2 + field.fe_add(&t1, &x2, &z2) + // B = x_2 - z_2 + field.fe_sub(&t2, &x2, &z2) + // D = x_3 - z_3 + field.fe_sub(field.fe_relax_cast(&z2), &x3, &z3) // (z2 unreduced) + // DA = D * A + field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1) + // C = x_3 + z_3 + field.fe_add(field.fe_relax_cast(&z3), &x3, &z3) // (z3 unreduced) + // CB = C * B + field.fe_carry_mul(&x3, &t2, field.fe_relax_cast(&z3)) + // z_3 = x_1 * (DA - CB)^2 + field.fe_sub(field.fe_relax_cast(&z3), &x2, &x3) // (z3 unreduced) + field.fe_carry_square(&z3, field.fe_relax_cast(&z3)) + field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z3)) + // x_3 = (DA + CB)^2 + field.fe_add(field.fe_relax_cast(&z2), &x2, &x3) // (z2 unreduced) + field.fe_carry_square(&x3, field.fe_relax_cast(&z2)) + + // AA = A^2 + field.fe_carry_square(&z2, &t1) + // BB = B^2 + field.fe_carry_square(field.fe_tighten_cast(&t1), &t2) // (t1 reduced) + // x_2 = AA * BB + field.fe_carry_mul(&x2, field.fe_relax_cast(&z2), &t1) + // E = AA - BB + field.fe_sub(&t2, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced) + // z_2 = E * (AA + a24 * E) + field.fe_carry_mul_small(field.fe_tighten_cast(&t1), &t2, 39081) // (t1 reduced) + field.fe_add(&t1, &z2, field.fe_tighten_cast(&t1)) // (t1 (input) is reduced) + field.fe_carry_mul(&z2, &t2, &t1) + } + + // Conditional swap; see text below. + // (x_2, x_3) = cswap(swap, x_2, x_3) + field.fe_cond_swap(&x2, &x3, swap) + // (z_2, z_3) = cswap(swap, z_2, z_3) + field.fe_cond_swap(&z2, &z3, swap) + + // Return x_2 * (z_2^(p - 2)) + field.fe_carry_inv(&z2, field.fe_relax_cast(&z2)) + field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2)) + field.fe_to_bytes(out, &x2) + + field.fe_clear_vec([]^field.Tight_Field_Element{&x1, &x2, &x3, &z2, &z3}) + field.fe_clear_vec([]^field.Loose_Field_Element{&t1, &t2}) +} + +// scalarmult "multiplies" the provided scalar and point, and writes the +// resulting point to dst. 
+scalarmult :: proc(dst, scalar, point: []byte) { + ensure(len(scalar) == SCALAR_SIZE, "crypto/x448: invalid scalar size") + ensure(len(point) == POINT_SIZE, "crypto/x448: invalid point size") + ensure(len(dst) == POINT_SIZE, "crypto/x448: invalid destination point size") + + // "clamp" the scalar + e: [56]byte = --- + copy_slice(e[:], scalar) + e[0] &= 252 + e[55] |= 128 + + p: [56]byte = --- + copy_slice(p[:], point) + + d: [56]byte = --- + _scalarmult(&d, &e, &p) + copy_slice(dst, d[:]) + + mem.zero_explicit(&e, size_of(e)) + mem.zero_explicit(&d, size_of(d)) +} + +// scalarmult_basepoint "multiplies" the provided scalar with the X448 +// base point and writes the resulting point to dst. +scalarmult_basepoint :: proc(dst, scalar: []byte) { + scalarmult(dst, scalar, _BASE_POINT[:]) +} diff --git a/core/simd/x86/ssse3.odin b/core/simd/x86/ssse3.odin index 07c846e7b..03ba5dcfb 100644 --- a/core/simd/x86/ssse3.odin +++ b/core/simd/x86/ssse3.odin @@ -21,7 +21,7 @@ _mm_abs_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i { _mm_shuffle_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i { return transmute(__m128i)pshufb128(transmute(u8x16)a, transmute(u8x16)b) } -@(require_results, enable_target_feature="ssse3") +@(require_results, enable_target_feature="sse2,ssse3") _mm_alignr_epi8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u32) -> __m128i { shift :: IMM8 diff --git a/core/sys/info/cpu_intel.odin b/core/sys/info/cpu_intel.odin index 95b53dda0..c8b8282fe 100644 --- a/core/sys/info/cpu_intel.odin +++ b/core/sys/info/cpu_intel.odin @@ -23,6 +23,7 @@ CPU_Feature :: enum u64 { popcnt, // Hamming weight instruction POPCNT. 
rdrand, // RDRAND instruction (on-chip random number generator) rdseed, // RDSEED instruction (on-chip random number generator) + sha, // SHA Extensions (SHA-1, SHA-224, SHA-256) sse2, // Streaming SIMD extension 2 (always available on amd64) sse3, // Streaming SIMD extension 3 ssse3, // Supplemental streaming SIMD extension 3 @@ -115,6 +116,7 @@ init_cpu_features :: proc "c" () { _, ebx7, ecx7, edx7 := cpuid(7, 0) try_set(&set, .bmi1, 3, ebx7) + try_set(&set, .sha, 29, ebx7) if os_supports_avx { try_set(&set, .avx2, 5, ebx7) } diff --git a/examples/all/all_main.odin b/examples/all/all_main.odin index 4a8a198d3..0e7648f96 100644 --- a/examples/all/all_main.odin +++ b/examples/all/all_main.odin @@ -26,12 +26,14 @@ import topological_sort "core:container/topological_sort" import crypto "core:crypto" import aead "core:crypto/aead" +import aegis "core:crypto/aegis" import aes "core:crypto/aes" import blake2b "core:crypto/blake2b" import blake2s "core:crypto/blake2s" import chacha20 "core:crypto/chacha20" import chacha20poly1305 "core:crypto/chacha20poly1305" import crypto_hash "core:crypto/hash" +import deoxysii "core:crypto/deoxysii" import ed25519 "core:crypto/ed25519" import hkdf "core:crypto/hkdf" import hmac "core:crypto/hmac" @@ -48,6 +50,7 @@ import shake "core:crypto/shake" import sm3 "core:crypto/sm3" import tuplehash "core:crypto/tuplehash" import x25519 "core:crypto/x25519" +import x448 "core:crypto/x448" import pe "core:debug/pe" import trace "core:debug/trace" @@ -169,11 +172,13 @@ _ :: topological_sort _ :: crypto _ :: crypto_hash _ :: aead +_ :: aegis _ :: aes _ :: blake2b _ :: blake2s _ :: chacha20 _ :: chacha20poly1305 +_ :: deoxysii _ :: ed25519 _ :: hmac _ :: hkdf @@ -190,6 +195,7 @@ _ :: shake _ :: sm3 _ :: tuplehash _ :: x25519 +_ :: x448 _ :: pe _ :: trace _ :: dynlib diff --git a/tests/benchmark/crypto/benchmark_aead.odin b/tests/benchmark/crypto/benchmark_aead.odin new file mode 100644 index 000000000..bfd888a43 --- /dev/null +++ 
b/tests/benchmark/crypto/benchmark_aead.odin @@ -0,0 +1,96 @@ +package benchmark_core_crypto + +import "base:runtime" +import "core:crypto" +import "core:testing" +import "core:text/table" +import "core:time" + +import "core:crypto/aead" + +@(private = "file") +ITERS :: 10000 +@(private = "file") +SIZES := []int{64, 1024, 65536} + +@(test) +benchmark_crypto_aead :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "AEAD") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput") + + for algo, i in aead.Algorithm { + if algo == .Invalid { + continue + } + if i > 1 { + table.row(&tbl) + } + + algo_name := aead.ALGORITHM_NAMES[algo] + key_sz := aead.KEY_SIZES[algo] + + key := make([]byte, key_sz, context.temp_allocator) + crypto.rand_bytes(key) + + // TODO: Benchmark all available implementations? + ctx: aead.Context + aead.init(&ctx, algo, key) + + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = aead.IV_SIZES[algo] + sz, + setup = setup_sized_buf, + bench = do_bench_aead, + teardown = teardown_sized_buf, + } + context.user_ptr = &ctx + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + algo_name, + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + log_table(&tbl) +} + +@(private = "file") +do_bench_aead :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + tag_: [aead.MAX_TAG_SIZE]byte + + ctx := (^aead.Context)(context.user_ptr) + iv_sz := aead.iv_size(ctx) + + iv := options.input[:iv_sz] + buf := options.input[iv_sz:] + tag := tag_[:aead.tag_size(ctx)] + + for _ in 0 ..< options.rounds { 
+ aead.seal_ctx(ctx, buf, tag, iv, nil, buf) + } + options.count = options.rounds + options.processed = options.rounds * (options.bytes - iv_sz) + + return +} diff --git a/tests/benchmark/crypto/benchmark_crypto.odin b/tests/benchmark/crypto/benchmark_crypto.odin deleted file mode 100644 index b139ea669..000000000 --- a/tests/benchmark/crypto/benchmark_crypto.odin +++ /dev/null @@ -1,415 +0,0 @@ -package benchmark_core_crypto - -import "base:runtime" -import "core:encoding/hex" -import "core:fmt" -import "core:log" -import "core:strings" -import "core:testing" -import "core:time" - -import "core:crypto/aes" -import "core:crypto/chacha20" -import "core:crypto/chacha20poly1305" -import "core:crypto/ed25519" -import "core:crypto/poly1305" -import "core:crypto/x25519" - -// Cryptographic primitive benchmarks. - -@(test) -benchmark_crypto :: proc(t: ^testing.T) { - runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() - - str: strings.Builder - strings.builder_init(&str, context.allocator) - defer { - log.info(strings.to_string(str)) - strings.builder_destroy(&str) - } - - { - name := "AES256-CTR 64 bytes" - options := &time.Benchmark_Options { - rounds = 1_000, - bytes = 64, - setup = _setup_sized_buf, - bench = _benchmark_aes256_ctr, - teardown = _teardown_sized_buf, - } - - err := time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "AES256-CTR 1024 bytes" - options.bytes = 1024 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "AES256-CTR 65536 bytes" - options.bytes = 65536 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - } - { - name := "ChaCha20 64 bytes" - options := &time.Benchmark_Options { - rounds = 1_000, - bytes = 64, - setup = _setup_sized_buf, - bench = _benchmark_chacha20, - teardown = _teardown_sized_buf, - } - - err 
:= time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "ChaCha20 1024 bytes" - options.bytes = 1024 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "ChaCha20 65536 bytes" - options.bytes = 65536 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - } - { - name := "Poly1305 64 zero bytes" - options := &time.Benchmark_Options { - rounds = 1_000, - bytes = 64, - setup = _setup_sized_buf, - bench = _benchmark_poly1305, - teardown = _teardown_sized_buf, - } - - err := time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "Poly1305 1024 zero bytes" - options.bytes = 1024 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - } - { - name := "chacha20poly1305 64 bytes" - options := &time.Benchmark_Options { - rounds = 1_000, - bytes = 64, - setup = _setup_sized_buf, - bench = _benchmark_chacha20poly1305, - teardown = _teardown_sized_buf, - } - - err := time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "chacha20poly1305 1024 bytes" - options.bytes = 1024 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "chacha20poly1305 65536 bytes" - options.bytes = 65536 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - } - { - name := "AES256-GCM 64 bytes" - options := &time.Benchmark_Options { - rounds = 1_000, - bytes = 64, - setup = _setup_sized_buf, - bench = _benchmark_aes256_gcm, - teardown = _teardown_sized_buf, - } - - key := 
[aes.KEY_SIZE_256]byte { - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - } - ctx: aes.Context_GCM - aes.init_gcm(&ctx, key[:]) - - context.user_ptr = &ctx - - err := time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "AES256-GCM 1024 bytes" - options.bytes = 1024 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - - name = "AES256-GCM 65536 bytes" - options.bytes = 65536 - err = time.benchmark(options, context.allocator) - testing.expect(t, err == nil, name) - benchmark_print(&str, name, options) - } - { - iters :: 10000 - - priv_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" - priv_bytes, _ := hex.decode(transmute([]byte)(priv_str), context.temp_allocator) - priv_key: ed25519.Private_Key - start := time.now() - for i := 0; i < iters; i = i + 1 { - ok := ed25519.private_key_set_bytes(&priv_key, priv_bytes) - assert(ok, "private key should deserialize") - } - elapsed := time.since(start) - fmt.sbprintfln(&str, - "ed25519.private_key_set_bytes: ~%f us/op", - time.duration_microseconds(elapsed) / iters, - ) - - pub_bytes := priv_key._pub_key._b[:] // "I know what I am doing" - pub_key: ed25519.Public_Key - start = time.now() - for i := 0; i < iters; i = i + 1 { - ok := ed25519.public_key_set_bytes(&pub_key, pub_bytes[:]) - assert(ok, "public key should deserialize") - } - elapsed = time.since(start) - fmt.sbprintfln(&str, - "ed25519.public_key_set_bytes: ~%f us/op", - time.duration_microseconds(elapsed) / iters, - ) - - msg := "Got a job for you, 621." 
- sig_bytes: [ed25519.SIGNATURE_SIZE]byte - msg_bytes := transmute([]byte)(msg) - start = time.now() - for i := 0; i < iters; i = i + 1 { - ed25519.sign(&priv_key, msg_bytes, sig_bytes[:]) - } - elapsed = time.since(start) - fmt.sbprintfln(&str, - "ed25519.sign: ~%f us/op", - time.duration_microseconds(elapsed) / iters, - ) - - start = time.now() - for i := 0; i < iters; i = i + 1 { - ok := ed25519.verify(&pub_key, msg_bytes, sig_bytes[:]) - assert(ok, "signature should validate") - } - elapsed = time.since(start) - fmt.sbprintfln(&str, - "ed25519.verify: ~%f us/op", - time.duration_microseconds(elapsed) / iters, - ) - } - { - point_str := "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef" - scalar_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" - - point, _ := hex.decode(transmute([]byte)(point_str), context.temp_allocator) - scalar, _ := hex.decode(transmute([]byte)(scalar_str), context.temp_allocator) - out: [x25519.POINT_SIZE]byte = --- - - iters :: 10000 - start := time.now() - for i := 0; i < iters; i = i + 1 { - x25519.scalarmult(out[:], scalar[:], point[:]) - } - elapsed := time.since(start) - - fmt.sbprintfln(&str, - "x25519.scalarmult: ~%f us/op", - time.duration_microseconds(elapsed) / iters, - ) - } -} - -@(private) -_setup_sized_buf :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - assert(options != nil) - - options.input = make([]u8, options.bytes, allocator) - return nil if len(options.input) == options.bytes else .Allocation_Error -} - -@(private) -_teardown_sized_buf :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - assert(options != nil) - - delete(options.input) - return nil -} - -@(private) -_benchmark_chacha20 :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - buf := options.input - key := 
[chacha20.KEY_SIZE]byte { - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - } - iv := [chacha20.IV_SIZE]byte { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - } - - ctx: chacha20.Context = --- - chacha20.init(&ctx, key[:], iv[:]) - - for _ in 0 ..= options.rounds { - chacha20.xor_bytes(&ctx, buf, buf) - } - options.count = options.rounds - options.processed = options.rounds * options.bytes - return nil -} - -@(private) -_benchmark_poly1305 :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - buf := options.input - key := [poly1305.KEY_SIZE]byte { - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - } - - tag: [poly1305.TAG_SIZE]byte = --- - for _ in 0 ..= options.rounds { - poly1305.sum(tag[:], buf, key[:]) - } - options.count = options.rounds - options.processed = options.rounds * options.bytes - //options.hash = u128(h) - return nil -} - -@(private) -_benchmark_chacha20poly1305 :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - buf := options.input - key := [chacha20.KEY_SIZE]byte { - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - } - iv := [chacha20.IV_SIZE]byte { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, - } - - ctx: chacha20poly1305.Context = --- - chacha20poly1305.init(&ctx, key[:]) // Basically 0 overhead. 
- - tag: [chacha20poly1305.TAG_SIZE]byte = --- - - for _ in 0 ..= options.rounds { - chacha20poly1305.seal(&ctx, buf, tag[:], iv[:], nil, buf) - } - options.count = options.rounds - options.processed = options.rounds * options.bytes - return nil -} - -@(private) -_benchmark_aes256_ctr :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - buf := options.input - key := [aes.KEY_SIZE_256]byte { - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, - } - iv := [aes.CTR_IV_SIZE]byte { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - } - - ctx: aes.Context_CTR = --- - aes.init_ctr(&ctx, key[:], iv[:]) - - for _ in 0 ..= options.rounds { - aes.xor_bytes_ctr(&ctx, buf, buf) - } - options.count = options.rounds - options.processed = options.rounds * options.bytes - return nil -} - -_benchmark_aes256_gcm :: proc( - options: ^time.Benchmark_Options, - allocator := context.allocator, -) -> ( - err: time.Benchmark_Error, -) { - buf := options.input - iv: [aes.GCM_IV_SIZE]byte - tag: [aes.GCM_TAG_SIZE]byte = --- - - ctx := (^aes.Context_GCM)(context.user_ptr) - - for _ in 0 ..= options.rounds { - aes.seal_gcm(ctx, buf, tag[:], iv[:], nil, buf) - } - options.count = options.rounds - options.processed = options.rounds * options.bytes - return nil -} - -@(private) -benchmark_print :: proc(str: ^strings.Builder, name: string, options: ^time.Benchmark_Options, loc := #caller_location) { - fmt.sbprintfln(str, "[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n", - name, - options.rounds, - options.processed, - time.duration_nanoseconds(options.duration), - options.rounds_per_second, - options.megabytes_per_second, - ) -} diff --git a/tests/benchmark/crypto/benchmark_ecc.odin 
b/tests/benchmark/crypto/benchmark_ecc.odin new file mode 100644 index 000000000..16ca798dc --- /dev/null +++ b/tests/benchmark/crypto/benchmark_ecc.odin @@ -0,0 +1,163 @@ +package benchmark_core_crypto + +import "base:runtime" +import "core:encoding/hex" +import "core:testing" +import "core:text/table" +import "core:time" + +import "core:crypto/ed25519" +import "core:crypto/x25519" +import "core:crypto/x448" + +@(private = "file") +ECDH_ITERS :: 10000 +@(private = "file") +DSA_ITERS :: 10000 + +@(test) +benchmark_crypto_ecc :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + bench_ecdh() + bench_dsa() +} + +@(private = "file") +bench_ecdh :: proc() { + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "ECDH") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Scalar-Basepoint", "Scalar-Point") + + append_tbl := proc(tbl: ^table.Table, algo_name: string, bp, sc: time.Duration) { + table.aligned_row_of_values( + tbl, + .Right, + algo_name, + table.format(tbl, "%8M", bp), + table.format(tbl, "%8M", sc), + ) + } + + scalar_bp, scalar := bench_x25519() + append_tbl(&tbl, "X25519", scalar_bp, scalar) + + scalar_bp, scalar = bench_x448() + append_tbl(&tbl, "X448", scalar_bp, scalar) + + log_table(&tbl) +} + +@(private = "file") +bench_x25519 :: proc() -> (bp, sc: time.Duration) { + point_str := "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef" + scalar_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" + + point, _ := hex.decode(transmute([]byte)(point_str), context.temp_allocator) + scalar, _ := hex.decode(transmute([]byte)(scalar_str), context.temp_allocator) + out: [x25519.POINT_SIZE]byte = --- + + start := time.tick_now() + for _ in 0 ..< ECDH_ITERS { + x25519.scalarmult_basepoint(out[:], scalar[:]) + } + bp = time.tick_since(start) / ECDH_ITERS + + start = time.tick_now() + for _ in 0 ..< ECDH_ITERS { + x25519.scalarmult(out[:], scalar[:], point[:]) + } + 
sc = time.tick_since(start) / ECDH_ITERS + + return +} + +@(private = "file") +bench_x448 :: proc() -> (bp, sc: time.Duration) { + point_str := "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef" + scalar_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" + + point, _ := hex.decode(transmute([]byte)(point_str), context.temp_allocator) + scalar, _ := hex.decode(transmute([]byte)(scalar_str), context.temp_allocator) + out: [x448.POINT_SIZE]byte = --- + + start := time.tick_now() + for _ in 0 ..< ECDH_ITERS { + x448.scalarmult_basepoint(out[:], scalar[:]) + } + bp = time.tick_since(start) / ECDH_ITERS + + start = time.tick_now() + for _ in 0 ..< ECDH_ITERS { + x448.scalarmult(out[:], scalar[:], point[:]) + } + sc = time.tick_since(start) / ECDH_ITERS + + return +} + +@(private = "file") +bench_dsa :: proc() { + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "ECDSA/EdDSA") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Op", "Time") + + append_tbl := proc(tbl: ^table.Table, algo_name, op: string, t: time.Duration) { + table.aligned_row_of_values( + tbl, + .Right, + algo_name, + op, + table.format(tbl, "%8M", t), + ) + } + + sk, sig, verif := bench_ed25519() + append_tbl(&tbl, "ed25519", "private_key_set_bytes", sk) + append_tbl(&tbl, "ed25519", "sign", sig) + append_tbl(&tbl, "ed25519", "verify", verif) + + log_table(&tbl) +} + +@(private = "file") +bench_ed25519 :: proc() -> (sk, sig, verif: time.Duration) { + priv_str := "cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe" + priv_bytes, _ := hex.decode(transmute([]byte)(priv_str), context.temp_allocator) + priv_key: ed25519.Private_Key + start := time.tick_now() + for _ in 0 ..< DSA_ITERS { + ok := ed25519.private_key_set_bytes(&priv_key, priv_bytes) + assert(ok, "private key should deserialize") + } + sk = 
time.tick_since(start) / DSA_ITERS + + pub_bytes := priv_key._pub_key._b[:] // "I know what I am doing" + pub_key: ed25519.Public_Key + ok := ed25519.public_key_set_bytes(&pub_key, pub_bytes[:]) + assert(ok, "public key should deserialize") + + msg := "Got a job for you, 621." + sig_bytes: [ed25519.SIGNATURE_SIZE]byte + msg_bytes := transmute([]byte)(msg) + start = time.tick_now() + for _ in 0 ..< DSA_ITERS { + ed25519.sign(&priv_key, msg_bytes, sig_bytes[:]) + } + sig = time.tick_since(start) / DSA_ITERS + + start = time.tick_now() + for _ in 0 ..< DSA_ITERS { + ok = ed25519.verify(&pub_key, msg_bytes, sig_bytes[:]) + assert(ok, "signature should validate") + } + verif = time.tick_since(start) / DSA_ITERS + + return +} diff --git a/tests/benchmark/crypto/benchmark_hash.odin b/tests/benchmark/crypto/benchmark_hash.odin new file mode 100644 index 000000000..f9c560e6d --- /dev/null +++ b/tests/benchmark/crypto/benchmark_hash.odin @@ -0,0 +1,101 @@ +package benchmark_core_crypto + +import "base:runtime" +import "core:testing" +import "core:text/table" +import "core:time" + +import "core:crypto/hash" + +@(private = "file") +ITERS :: 10000 +@(private = "file") +SIZES := []int{64, 1024, 65536} + +@(test) +benchmark_crypto_hash :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "Hash") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput") + + for algo, i in hash.Algorithm { + // Skip the sentinel value, and uncommon algorithms + #partial switch algo { + case .Invalid: + continue + case .Legacy_KECCAK_224, .Legacy_KECCAK_256, .Legacy_KECCAK_384, .Legacy_KECCAK_512: + // Skip: Legacy and not worth using over SHA3 + continue + case .Insecure_MD5, .Insecure_SHA1: + // Skip: Legacy and not worth using at all + continue + case .SHA224, .SHA384, .SHA3_224, .SHA3_384: + // Skip: Uncommon SHA2/SHA3 variants + continue + case .SM3: + // 
Skip: Liberty Prime is online. All systems nominal. + // Weapons hot. Mission: the destruction of any and + // all Chinese communists. + continue + } + if i > 1 { + table.row(&tbl) + } + + algo_name := hash.ALGORITHM_NAMES[algo] + + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + setup = setup_sized_buf, + bench = do_bench_hash, + teardown = teardown_sized_buf, + } + tmp := algo + context.user_ptr = &tmp + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + algo_name, + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + log_table(&tbl) +} + +@(private = "file") +do_bench_hash :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + digest_: [hash.MAX_DIGEST_SIZE]byte + + buf := options.input + algo := (^hash.Algorithm)(context.user_ptr)^ + digest := digest_[:hash.DIGEST_SIZES[algo]] + + for _ in 0 ..< options.rounds { + hash.hash_bytes_to_buffer(algo, buf, digest) + } + options.count = options.rounds + options.processed = options.rounds * (options.bytes) + + return +} diff --git a/tests/benchmark/crypto/benchmark_mac.odin b/tests/benchmark/crypto/benchmark_mac.odin new file mode 100644 index 000000000..a0d2cae90 --- /dev/null +++ b/tests/benchmark/crypto/benchmark_mac.odin @@ -0,0 +1,191 @@ +package benchmark_core_crypto + +import "base:runtime" +import "core:testing" +import "core:text/table" +import "core:time" + +import "core:crypto/hmac" +import "core:crypto/kmac" +import "core:crypto/poly1305" + +@(private = "file") +ITERS :: 10000 +@(private = "file") +SIZES := []int{64, 1024, 65536} +@(private = "file") +KMAC_KEY_SIZES := []int{128, 256} + +@(test) +benchmark_crypto_mac :: proc(t: ^testing.T) { + 
runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "MAC") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput") + + { + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + setup = setup_sized_buf, + bench = do_bench_hmac_sha_256, + teardown = teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + "HMAC-SHA256", + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + table.row(&tbl) + + for key_sz, i in KMAC_KEY_SIZES { + if i > 0 { + table.row(&tbl) + } + + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + processed = key_sz, // Pls ignore. 
+ setup = setup_sized_buf, + bench = do_bench_kmac, + teardown = teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + table.format(&tbl, "KMAC%d", key_sz), + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + table.row(&tbl) + + { + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + setup = setup_sized_buf, + bench = do_bench_poly1305, + teardown = teardown_sized_buf, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + "poly1305", + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + log_table(&tbl) +} + +@(private = "file") +do_bench_hmac_sha_256 :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + key := [32]byte { + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + + tag: [32]byte = --- + for _ in 0 ..= options.rounds { + hmac.sum(.SHA256, tag[:], buf, key[:]) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + + return +} + +@(private = "file") +do_bench_kmac :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + key := [kmac.MIN_KEY_SIZE_256]byte { + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 
0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + sec_strength := options.processed + + tag: [32]byte = --- + for _ in 0 ..= options.rounds { + kmac.sum(sec_strength, tag[:sec_strength/8], buf, key[:], nil) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + + return +} + +@(private = "file") +do_bench_poly1305 :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + key := [poly1305.KEY_SIZE]byte { + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + 0xde, 0xad, 0xbe, 0xef, 0xde, 0xad, 0xbe, 0xef, + } + + tag: [poly1305.TAG_SIZE]byte = --- + for _ in 0 ..= options.rounds { + poly1305.sum(tag[:], buf, key[:]) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + + return +} diff --git a/tests/benchmark/crypto/benchmark_stream.odin b/tests/benchmark/crypto/benchmark_stream.odin new file mode 100644 index 000000000..38c5a87c6 --- /dev/null +++ b/tests/benchmark/crypto/benchmark_stream.odin @@ -0,0 +1,145 @@ +package benchmark_core_crypto + +import "base:runtime" +import "core:crypto" +import "core:testing" +import "core:text/table" +import "core:time" + +import "core:crypto/aes" +import "core:crypto/chacha20" + +@(private = "file") +ITERS :: 10000 +@(private = "file") +SIZES := []int{64, 1024, 65536} +@(private = "file") +AES_CTR_KEY_SIZES := []int{128, 192, 256} + +@(test) +benchmark_crypto_stream :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "Stream Cipher") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput") + + for key_sz, i in AES_CTR_KEY_SIZES { + if i > 0 { + table.row(&tbl) + } + + key := 
make([]byte, key_sz/8, context.temp_allocator) + iv := make([]byte, aes.CTR_IV_SIZE, context.temp_allocator) + crypto.rand_bytes(key) + crypto.rand_bytes(iv) + + ctx: aes.Context_CTR + aes.init_ctr(&ctx, key, iv) + + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + setup = setup_sized_buf, + bench = do_bench_aes_ctr, + teardown = teardown_sized_buf, + } + context.user_ptr = &ctx + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + table.format(&tbl, "AES%d-CTR", key_sz), + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + table.row(&tbl) + + { + key := make([]byte, chacha20.KEY_SIZE, context.temp_allocator) + iv := make([]byte, chacha20.IV_SIZE, context.temp_allocator) + crypto.rand_bytes(key) + crypto.rand_bytes(iv) + + ctx: chacha20.Context + chacha20.init(&ctx, key, iv) + + for sz, _ in SIZES { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = sz, + setup = setup_sized_buf, + bench = do_bench_chacha20, + teardown = teardown_sized_buf, + } + context.user_ptr = &ctx + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + &tbl, + .Right, + "chacha20", + table.format(&tbl, "%d", sz), + table.format(&tbl, "%8M", time_per_iter), + table.format(&tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) + } + } + + log_table(&tbl) +} + +@(private = "file") +do_bench_aes_ctr :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + ctx := (^aes.Context_CTR)(context.user_ptr) + + buf := options.input + + for _ in 0 ..= options.rounds { + aes.xor_bytes_ctr(ctx, buf, buf) + } + options.count = options.rounds + 
options.processed = options.rounds * options.bytes + + return +} + +@(private = "file") +do_bench_chacha20 :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + ctx := (^chacha20.Context)(context.user_ptr) + + buf := options.input + + for _ in 0 ..= options.rounds { + chacha20.xor_bytes(ctx, buf, buf) + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + + return +} diff --git a/tests/benchmark/crypto/benchmark_utils.odin b/tests/benchmark/crypto/benchmark_utils.odin new file mode 100644 index 000000000..6609adbf7 --- /dev/null +++ b/tests/benchmark/crypto/benchmark_utils.odin @@ -0,0 +1,50 @@ +package benchmark_core_crypto + +import "core:crypto" +import "core:fmt" +import "core:log" +import "core:strings" +import "core:text/table" +import "core:time" + +@(private) +log_table :: #force_inline proc(tbl: ^table.Table) { + sb := strings.builder_make() + defer strings.builder_destroy(&sb) + + wr := strings.to_writer(&sb) + + fmt.sbprintln(&sb) + table.write_plain_table(wr, tbl) + + log.info(strings.to_string(sb)) +} + +@(private) +setup_sized_buf :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + assert(options != nil) + + options.input = make([]u8, options.bytes, allocator) + if len(options.input) > 0 { + crypto.rand_bytes(options.input) + } + return nil if len(options.input) == options.bytes else .Allocation_Error +} + +@(private) +teardown_sized_buf :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + assert(options != nil) + + delete(options.input) + return nil +} diff --git a/tests/core/crypto/test_core_crypto_aead.odin b/tests/core/crypto/test_core_crypto_aead.odin index 90eedc0b2..961311cd6 100644 --- a/tests/core/crypto/test_core_crypto_aead.odin +++ b/tests/core/crypto/test_core_crypto_aead.odin @@ -1,7 +1,10 @@ 
package test_core_crypto import "base:runtime" +import "core:crypto/aes" +import "core:crypto/aegis" import "core:crypto/aead" +import "core:crypto/deoxysii" import "core:encoding/hex" import "core:testing" @@ -17,6 +20,14 @@ test_aead :: proc(t: ^testing.T) { for impl in supported_chacha_impls() { append(&chacha_impls, impl) } + aegis_impls := make([dynamic]aead.Implementation, context.temp_allocator) + for impl in supported_aegis_impls() { + append(&aegis_impls, impl) + } + deoxysii_impls := make([dynamic]aead.Implementation, context.temp_allocator) + for impl in supported_deoxysii_impls() { + append(&deoxysii_impls, impl) + } impls := [aead.Algorithm][dynamic]aead.Implementation{ .Invalid = nil, .AES_GCM_128 = aes_impls, @@ -24,6 +35,11 @@ test_aead :: proc(t: ^testing.T) { .AES_GCM_256 = aes_impls, .CHACHA20POLY1305 = chacha_impls, .XCHACHA20POLY1305 = chacha_impls, + .AEGIS_128L = aegis_impls, + .AEGIS_128L_256 = aegis_impls, + .AEGIS_256 = aegis_impls, + .AEGIS_256_256 = aegis_impls, + .DEOXYS_II_256 = deoxysii_impls, } test_vectors := []struct{ @@ -224,6 +240,263 @@ test_aead :: proc(t: ^testing.T) { "bd6d179d3e83d43b9576579493c0e939572a1700252bfaccbed2902c21396cbb731c7f1b0b4aa6440bf3a82f4eda7e39ae64c6708c54c216cb96b72e1213b4522f8c9ba40db5d945b11b69b982c1bb9e3f3fac2bc369488f76b2383565d3fff921f9664c97637da9768812f615c68b13b52e", "c0875924c1c7987947deafd8780acf49", }, + // AEGIS-128L + // https://www.ietf.org/archive/id/draft-irtf-cfrg-aegis-aead-11.txt + { + .AEGIS_128L, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "", + "00000000000000000000000000000000", + "c1c0e58bd913006feba00f4b3cc3594e", + "abe0ece80c24868a226a35d16bdae37a", + }, + { + .AEGIS_128L_256, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "", + "00000000000000000000000000000000", + "c1c0e58bd913006feba00f4b3cc3594e", + "25835bfbb21632176cf03840687cb968cace4617af1bd0f7d064c639a5c79ee4", + }, + { + .AEGIS_128L, + 
"10010000000000000000000000000000", + "10000200000000000000000000000000", + "", + "", + "", + "c2b879a67def9d74e6c14f708bbcc9b4", + }, + { + .AEGIS_128L_256, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "", + "", + "", + "1360dc9db8ae42455f6e5b6a9d488ea4f2184c4e12120249335c4ee84bafe25d", + }, + { + .AEGIS_128L, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "79d94593d8c2119d7e8fd9b8fc77845c5c077a05b2528b6ac54b563aed8efe84", + "cc6f3372f6aa1bb82388d695c3962d9a", + }, + { + .AEGIS_128L_256, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "79d94593d8c2119d7e8fd9b8fc77845c5c077a05b2528b6ac54b563aed8efe84", + "022cb796fe7e0ae1197525ff67e309484cfbab6528ddef89f17d74ef8ecd82b3", + }, + { + .AEGIS_128L, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d", + "79d94593d8c2119d7e8fd9b8fc77", + "5c04b3dba849b2701effbe32c7f0fab7", + }, + { + .AEGIS_128L_256, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d", + "79d94593d8c2119d7e8fd9b8fc77", + "86f1b80bfb463aba711d15405d094baf4a55a15dbfec81a76f35ed0b9c8b04ac", + }, + { + .AEGIS_128L, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20212223242526272829", + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637", + "b31052ad1cca4e291abcf2df3502e6bdb1bfd6db36798be3607b1f94d34478aa7ede7f7a990fec10", + "7542a745733014f9474417b337399507", + }, + { + .AEGIS_128L_256, + "10010000000000000000000000000000", + "10000200000000000000000000000000", + 
"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20212223242526272829", + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637", + "b31052ad1cca4e291abcf2df3502e6bdb1bfd6db36798be3607b1f94d34478aa7ede7f7a990fec10", + "b91e2947a33da8bee89b6794e647baf0fc835ff574aca3fc27c33be0db2aff98", + }, + // AEGIS-256 + // https://www.ietf.org/archive/id/draft-irtf-cfrg-aegis-aead-11.txt + { + .AEGIS_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "", + "00000000000000000000000000000000", + "754fc3d8c973246dcc6d741412a4b236", + "3fe91994768b332ed7f570a19ec5896e", + }, + { + .AEGIS_256_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "", + "00000000000000000000000000000000", + "754fc3d8c973246dcc6d741412a4b236", + "1181a1d18091082bf0266f66297d167d2e68b845f61a3b0527d31fc7b7b89f13", + }, + { + .AEGIS_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "", + "", + "", + "e3def978a0f054afd1e761d7553afba3", + }, + { + .AEGIS_256_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "", + "", + "", + "6a348c930adbd654896e1666aad67de989ea75ebaa2b82fb588977b1ffec864a", + }, + { + .AEGIS_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "f373079ed84b2709faee373584585d60accd191db310ef5d8b11833df9dec711", + "8d86f91ee606e9ff26a01b64ccbdd91d", + }, + { + .AEGIS_256_256, + "1001000000000000000000000000000000000000000000000000000000000000", + 
"1000020000000000000000000000000000000000000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "f373079ed84b2709faee373584585d60accd191db310ef5d8b11833df9dec711", + "b7d28d0c3c0ebd409fd22b44160503073a547412da0854bfb9723020dab8da1a", + }, + { + .AEGIS_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d", + "f373079ed84b2709faee37358458", + "c60b9c2d33ceb058f96e6dd03c215652", + }, + { + .AEGIS_256_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "0001020304050607", + "000102030405060708090a0b0c0d", + "f373079ed84b2709faee37358458", + "8c1cc703c81281bee3f6d9966e14948b4a175b2efbdc31e61a98b4465235c2d9", + }, + { + .AEGIS_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20212223242526272829", + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637", + "57754a7d09963e7c787583a2e7b859bb24fa1e04d49fd550b2511a358e3bca252a9b1b8b30cc4a67", + "ab8a7d53fd0e98d727accca94925e128", + }, + { + .AEGIS_256_256, + "1001000000000000000000000000000000000000000000000000000000000000", + "1000020000000000000000000000000000000000000000000000000000000000", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20212223242526272829", + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637", + "57754a7d09963e7c787583a2e7b859bb24fa1e04d49fd550b2511a358e3bca252a9b1b8b30cc4a67", + "a3aca270c006094d71c20e6910b5161c0826df233d08919a566ec2c05990f734", + }, + // Deoxys-II-256 + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + 
"202122232425262728292a2b2c2d2e", + "", + "", + "", + "2b97bd77712f0cde975309959dfe1d7c", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "", + "", + "54708ae5565a71f147bdb94d7ba3aed7", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "f495c9c03d29989695d98ff5d430650125805c1e0576d06f26cbda42b1f82238b8", + "", + "", + "3277689dc4208cc1ff59d15434a1baf1", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "9da20db1c2781f6669257d87e2a4d9be1970f7581bef2c995e1149331e5e8cc1", + "92ce3aec3a4b72ff9eab71c2a93492fa", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "", + "15cd77732f9d0c4c6e581ef400876ad9188c5b8850ebd38224da95d7cdc99f7acc", + "e5ffd2abc5b459a73667756eda6443ede86c0883fc51dd75d22bb14992c684618c", + "5fa78d57308f19d0252072ee39df5ecc", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "000102030405060708090a0b0c0d0e0f", + "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f", + "109f8a168b36dfade02628a9e129d5257f03cc7912aefa79729b67b186a2b08f", + "6549f9bf10acba0a451dbb2484a60d90", + }, + { + .DEOXYS_II_256, + "101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "000102030405060708090a0b0c0d0e0f10", + "422857fb165af0a35c03199fb895604dca9cea6d788954962c419e0d5c225c0327", + "7d772203fa38be296d8d20d805163130c69aba8cb16ed845c2296c61a8f34b394e", + "0b3f10e3933c78190b24b33008bf80e9", + }, + { + .DEOXYS_II_256, + 
"101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f", + "202122232425262728292a2b2c2d2e", + "3290bb8441279dc6083a43e9048c3dc08966ab30d7a6b35759e7a13339f124918f3b5ab1affa65e6c0e3680eb33a6ec82424ab1ce5a40b8654e13d845c29b13896a1466a75fc875acba4527ded37ed00c600a357c9a6e586c74cf3d85cd3258c813218f319d12b82480e5124ff19ec00bda1fbb8bd25eeb3de9fcbf3296deba250caf7e9f4ef0be1918e24221dd0be888c59c166ad761d7b58462a1b1d44b04265b45827172c133dd5b6c870b9af7b21368d12a88f4efa1751047543d584382d9ec22e7550d50ecddba27d1f65453f1f3398de54ee8c1f4ac8e16f5523d89641e99a632380af0f0b1e6b0e192ec29bf1d8714978ff9fbfb93604142393e9a82c3aaebbbe15e3b4e5cfd18bdfe309315c9f9f830deebe2edcdc24f8eca90fda49f6646e789c5041fb5be933fa843278e95f3a54f8eb41f14777ea949d5ea442b01249e64816151a325769e264ed4acd5c3f21700ca755d5bc0c2c5f9453419510bc74f2d71621dcecb9efc9c24791b4bb560fb70a8231521d6560af89d8d50144d9c080863f043781153bcd59030e60bd17a6d7aa083211b67b581fa4f74cce4d030d1e8f9429fd725c110040d41eb6989ffb1595c72cbe3c9b78a8ab80d71a6a5283da77b89cae295bb13c14fbe466b617f4da8ad60b085e2ea153f6713ae0046aa31e0ba44e43ef36a111bf05c073a4e3624cd35f63a546f9142b35aa81b8826d", + 
"83dab23b1379e090755c99079cfe918cb737e989f2d720ccaff493a744927644fec3653211fa75306a83486e5c34ecfe63870c97251a73e4b9033ae374809711b211ed5d293a592e466a81170f1d85750b5ca025ccd4579947edbae9ec132bfb1a7233ad79fae30006a6699f143893861b975226ed9d3cfb8a240be232fbf4e83755d59d20bc2faa2ea5e5b0428427485cca5e76a89fe32bdd59ab4177ad7cb1899c101e3c4f7535129591390ebdf30140846078b13867bbb2efd6cf434afe356eb18d716b21fd664c26c908496534bf2cde6d6b897799016594fb6d9f830ae5f44ccec26d42ff0d1a21b80cdbe8c8c170a5f766fad884abcc781b5b8ebc0f559bfeaa4557b04d977d51411a7f47bf437d0280cf9f92bc4f9cd6226337a492320851955adae2cafea22a89c3132dd252e4728328eda05555dff3241404341b8aa502d45c456113af42a8e91a85e4b4e9555028982ec3d144722af0eb04a6d3b8127c3040629de53f5fd187048198e8f8e8cc857afcbae45c693fec12fc2149d5e7587d0121b1717d0147f6979f75e8f085293f705c3399a6cc8df7057bf481e6c374edf0a0af7479f858045357b7fe21021c3fabdaf012652bf2e5db257bd9490ce637a81477bd3f9814a2198fdb9afa9344321f2393798670e588c47a1924d592cda3eb5a96754dfd92d87ee1ffa9d4ee586c85d7518c5d2db57d0451c33de0", + 
"88294fcef65a1bdfd7baaa472816c64ef5bef2622b88c1ec5a739396157ef4935f3aa76449e391c32da28ee2857f399ac3dd95aed30cfb26cc0063cd4cd8f7431108176fbf370123856662b000a8348e5925fbb97c9ec0c737758330a7983f06b51590c1d2f5e5faaf0eb58e34e19e5fc85cec03d3926dd46a79ba7026e83dec24e07484c9103dd0cdb0edb505500caca5e1d5dbc71348cf00648821488ebaab7f9d84bbbf91b3c521dbef30110e7bd94f8dad5ab8e0cc5411ca9682d210d5d80c0c4bdbba8181789a4273d6deb80899fdcd976ca6f3a9770b54305f586a04256cfbeb4c11254e88559f294db3b9a94b80ab9f9a02cb4c0748de0af7818685521691dba5738be546dba13a56016fb8635af9dff50f25d1b17ad21707db2640a76a741e65e559b2afaaec0f37e18436bf02008f84dbd7b2698687a22376b65dc7524fca8a28709eee3f3caee3b28ed1173d1e08ee849e2ca63d2c90d555755c8fbafd5d2f4b37f06a1dbd6852ee2ffcfe79d510152e98fc4f3094f740a4aede9ee378b606d34576776bf5f1269f5385a84b3928433bfca177550ccfcd22cd0331bbc595e38c2758b2662476fa66354c4e84c7b360405aa3f5b2a48621bdca1a90c69b21789c91b5b8c568e3c741d99e22f6d7e26f2abed045f1d578b782ab4a5cf2af636d842b3012e180e4b045d8d15b057b69c92398a517053daf9be7c2935e", + "a616f0c218e18b526cf2a3f8c115e262", + }, } for v, _ in test_vectors { algo_name := aead.ALGORITHM_NAMES[v.algo] @@ -337,3 +610,23 @@ test_aead :: proc(t: ^testing.T) { } } } + +supported_aegis_impls :: proc() -> [dynamic]aes.Implementation { + impls := make([dynamic]aes.Implementation, 0, 2, context.temp_allocator) + append(&impls, aes.Implementation.Portable) + if aegis.is_hardware_accelerated() { + append(&impls, aes.Implementation.Hardware) + } + + return impls +} + +supported_deoxysii_impls :: proc() -> [dynamic]aes.Implementation { + impls := make([dynamic]aes.Implementation, 0, 2, context.temp_allocator) + append(&impls, aes.Implementation.Portable) + if deoxysii.is_hardware_accelerated() { + append(&impls, aes.Implementation.Hardware) + } + + return impls +} diff --git a/tests/core/crypto/test_core_crypto_ecc25519.odin b/tests/core/crypto/test_core_crypto_edwards.odin similarity index 91% rename from tests/core/crypto/test_core_crypto_ecc25519.odin 
rename to tests/core/crypto/test_core_crypto_edwards.odin
index fec4fa38e..61933c00f 100644
--- a/tests/core/crypto/test_core_crypto_ecc25519.odin
+++ b/tests/core/crypto/test_core_crypto_edwards.odin
@@ -7,6 +7,7 @@ import field "core:crypto/_fiat/field_curve25519"
 import "core:crypto/ed25519"
 import "core:crypto/ristretto255"
 import "core:crypto/x25519"
+import "core:crypto/x448"
 
 @(test)
 test_sqrt_ratio_m1 :: proc(t: ^testing.T) {
@@ -684,6 +685,68 @@ test_x25519 :: proc(t: ^testing.T) {
 	}
 }
 
+@(test)
+test_x448 :: proc(t: ^testing.T) { // Checks x448.scalarmult against fixed vectors, then cross-checks x448.scalarmult_basepoint against it.
+	// Local copy of this so that the base point doesn't need to be exported.
+	_BASE_POINT: [56]byte = {
+		5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0,
+	}
+
+	test_vectors := []struct { // all three fields are hex strings
+		scalar: string,
+		point: string,
+		product: string,
+	} {
+		// Test vectors from RFC 7748
+		{
+			"3d262fddf9ec8e88495266fea19a34d28882acef045104d0d1aae121700a779c984c24f8cdd78fbff44943eba368f54b29259a4f1c600ad3",
+			"06fce640fa3487bfda5f6cf2d5263f8aad88334cbd07437f020f08f9814dc031ddbdc38c19c6da2583fa5429db94ada18aa7a7fb4ef8a086",
+			"ce3e4ff95a60dc6697da1db1d85e6afbdf79b50a2412d7546d5f239fe14fbaadeb445fc66a01b0779d98223961111e21766282f73dd96b6f",
+		},
+		{
+			"203d494428b8399352665ddca42f9de8fef600908e0d461cb021f8c538345dd77c3e4806e25f46d3315c44e0a5b4371282dd2c8d5be3095f",
+			"0fbcc2f993cd56d3305b0b7d9e55d4c1a8fb5dbb52f8e9a1e9b6201b165d015894e56c4d3570bee52fe205e28a78b91cdfbde71ce8d157db",
+			"884a02576239ff7a2f2f63b2db6a9ff37047ac13568e1e30fe63c4a7ad1b3ee3a5700df34321d62077e63633c575c1c954514e99da7c179d",
+		},
+	}
+	for v, _ in test_vectors {
+		scalar, _ := hex.decode(transmute([]byte)(v.scalar), context.temp_allocator) // decode status is discarded
+		point, _ := hex.decode(transmute([]byte)(v.point), context.temp_allocator) // decode status is discarded
+
+		derived_point: [x448.POINT_SIZE]byte
+		x448.scalarmult(derived_point[:], scalar[:], point[:]) // result is written into derived_point
+		derived_point_str := string(hex.encode(derived_point[:], context.temp_allocator))
+
+		testing.expectf(
+			t,
+			derived_point_str == v.product, // compared as hex text against the vector's expected product
+			"Expected %s for %s * %s, but got %s instead",
+			v.product,
+			v.scalar,
+			v.point,
+			derived_point_str,
+		)
+
+		// Abuse the test vectors to sanity-check the scalar-basepoint multiply.
+		p1, p2: [x448.POINT_SIZE]byte
+		x448.scalarmult_basepoint(p1[:], scalar[:]) // p1: dedicated base-point routine
+		x448.scalarmult(p2[:], scalar[:], _BASE_POINT[:]) // p2: generic routine fed the local _BASE_POINT copy
+		p1_str := string(hex.encode(p1[:], context.temp_allocator))
+		p2_str := string(hex.encode(p2[:], context.temp_allocator))
+		testing.expectf(
+			t,
+			p1_str == p2_str, // the two routes must agree
+			"Expected %s for %s * basepoint, but got %s instead",
+			p2_str,
+			v.scalar,
+			p1_str,
+		)
+	}
+}
+
 @(private)
 ge_str :: proc(ge: ^ristretto255.Group_Element) -> string {
 	b: [ristretto255.ELEMENT_SIZE]byte