core/crypto: Add x25519

This package implements the X25519 key agreement scheme as specified in
RFC 7748, using routines taken from fiat-crypto and Monocypher.
This commit is contained in:
Yawning Angel
2021-11-06 04:21:24 +00:00
parent d1e76ee4f2
commit 1a7a6a9116
7 changed files with 1039 additions and 0 deletions
+35
View File
@@ -0,0 +1,35 @@
# fiat
This package contains low level arithmetic required to implement certain
cryptographic primitives, ported from the [fiat-crypto project][1]
along with some higher-level helpers.
## Notes
fiat-crypto gives the choice of 3 licenses for derived works. The 1-Clause
BSD license is chosen as it is compatible with Odin's existing licensing.
The routines are intended to be timing-safe, as long as the underlying
integer arithmetic is constant time. This is true on most systems commonly
used today, with the notable exception of WASM.
While fiat-crypto provides output targeting both 32-bit and 64-bit
architectures, only the 64-bit versions were used, as 32-bit architectures
are becoming increasingly uncommon and irrelevant.
With the current Odin syntax, the Go output is trivially ported in most
cases and was used as the basis of the port.
In the future, it would be better to auto-generate Odin either directly
by adding an appropriate code-gen backend written in Coq, or perhaps by
parsing the JSON output.
As this is a port rather than autogenerated output, none of fiat-crypto's
formal verification guarantees apply, unless it is possible to prove binary
equivalence.
For the most part, alterations to the base fiat-crypto generated code were
kept to a minimum, to aid auditability. This results in a somewhat
idiosyncratic style, and in some cases minor performance penalties.
[1]: https://github.com/mit-plv/fiat-crypto
+24
View File
@@ -0,0 +1,24 @@
package fiat
// This package provides various helpers and types common to all of the
// fiat-crypto derived backends.
// This code only works on a two's complement system.
#assert((-1 & 3) == 3)
// u1 is the type used for carry/borrow/selector values in the fiat-crypto
// derived routines. It is backed by a full u8; nothing in the type itself
// restricts the value to a single bit.
u1 :: distinct u8
// i1 is the signed counterpart of u1, backed by an i8.
i1 :: distinct i8
// cmovznz_u64 is a branch-free select: it returns arg2 when arg1 is 0 and
// arg3 when arg1 is 1.
cmovznz_u64 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) {
	// 0 - arg1 (mod 2^64): all zeroes for arg1 == 0, all ones for arg1 == 1.
	mask := -u64(arg1)
	out1 = (mask & arg3) | (~mask & arg2)
	return
}
// cmovznz_u32 is a branch-free select: it returns arg2 when arg1 is 0 and
// arg3 when arg1 is 1.
cmovznz_u32 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u32) -> (out1: u32) {
	// 0 - arg1 (mod 2^32): all zeroes for arg1 == 0, all ones for arg1 == 1.
	mask := -u32(arg1)
	out1 = (mask & arg3) | (~mask & arg2)
	return
}
@@ -0,0 +1,138 @@
package field_curve25519
import "core:crypto"
import "core:mem"
// fe_relax_cast reinterprets a pointer to a Tight_Field_Element as a pointer
// to a Loose_Field_Element. No data is copied or modified; the result
// aliases arg1.
fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
	loose := transmute(^Loose_Field_Element)(arg1)
	return loose
}
// fe_tighten_cast reinterprets a pointer to a Loose_Field_Element as a
// pointer to a Tight_Field_Element. No carry/reduction is performed and the
// result aliases arg1, so the caller is responsible for the pointee already
// holding a value that is valid as a tight element.
fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
	tight := transmute(^Tight_Field_Element)(arg1)
	return tight
}
// fe_from_bytes deserializes the 32-byte encoding arg1 into out1, ignoring
// the most significant bit of the final byte. arg1 itself is not modified.
fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
	// Work on a private copy so the unused top bit can be cleared without
	// touching the caller's buffer.
	masked: [32]byte = arg1^
	masked[31] &= 0x7f

	_fe_from_bytes(out1, &masked)

	mem.zero_explicit(&masked, size_of(masked))
}
// fe_equal compares two field elements by serializing both and comparing
// the encodings via fe_equal_bytes, whose result it returns unchanged.
fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
	rhs_bytes: [32]byte = ---
	fe_to_bytes(&rhs_bytes, arg2)

	// Scrub the temporary encoding once the comparison result is computed.
	defer mem.zero_explicit(&rhs_bytes, size_of(rhs_bytes))

	return fe_equal_bytes(arg1, &rhs_bytes)
}
// fe_equal_bytes serializes arg1 and compares the encoding against arg2
// using crypto.compare_constant_time, returning that routine's result.
fe_equal_bytes :: proc "contextless" (arg1: ^Tight_Field_Element, arg2: ^[32]byte) -> int {
	lhs_bytes: [32]byte = ---
	fe_to_bytes(&lhs_bytes, arg1)

	// Scrub the temporary encoding once the comparison result is computed.
	defer mem.zero_explicit(&lhs_bytes, size_of(lhs_bytes))

	return crypto.compare_constant_time(lhs_bytes[:], arg2[:])
}
// fe_carry_pow2k sets out1 = arg1^(2^arg2) by squaring arg1 a total of arg2
// times, carrying after every squaring. out1 may alias arg1.
fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element, arg2: uint) {
	// Special case: zero squarings gives `arg1^(2^0) = arg1`, so the result
	// is simply the carried input. This should never happen in practice.
	if arg2 == 0 {
		fe_carry(out1, arg1)
		return
	}

	// First squaring consumes the (possibly loose) input ...
	fe_carry_square(out1, arg1)
	// ... the remaining arg2-1 squarings operate in place on out1.
	for _ in 1..<arg2 {
		fe_carry_square(out1, fe_relax_cast(out1))
	}
}
// fe_carry_opp stores the carried negation of arg1 in out1: fe_opp writes
// the loose result into out1's storage, which fe_carry then tightens in
// place.
fe_carry_opp :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
	loose_out := fe_relax_cast(out1)
	fe_opp(loose_out, arg1)
	fe_carry(out1, loose_out)
}
// fe_carry_invsqrt raises arg1 to the power (p-5)/8 with a fixed
// square-and-multiply chain, and stores in out1 either that power or that
// power multiplied by SQRT_M1, selected without branching.
//
// Returns 1 if arg1^((p-1)/4) serializes equal to 1 or to -1, else 0.
// NOTE(review): following Monocypher, a return of 1 presumably means out1 is
// a genuine inverse square root of arg1 -- confirm against the upstream
// routine before relying on it.
fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) -> int {
// Inverse square root taken from Monocypher.
tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
// t0 = x^((p-5)/8)
// Can be achieved with a simple double & add ladder,
// but it would be slower.
fe_carry_pow2k(&tmp1, arg1, 1)
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 5)
fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 10)
fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 20)
fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 10)
fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 50)
fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 100)
fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
// quartic = x^((p-1)/4), computed as t0^2 * x (tmp2 is reused as storage).
quartic := &tmp2
fe_carry_square(quartic, fe_relax_cast(&tmp1))
fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
// Serialize quartic once to save on repeated serialization/sanitization.
quartic_buf: [32]byte = ---
fe_to_bytes(&quartic_buf, quartic)
// Compare quartic against 1, -1 and -SQRT_M1 in turn (tmp3 is reused as
// storage for the value being compared against).
check := &tmp3
fe_one(check)
p1 := fe_equal_bytes(check, &quartic_buf)
fe_carry_opp(check, check)
m1 := fe_equal_bytes(check, &quartic_buf)
fe_carry_opp(check, &SQRT_M1)
ms := fe_equal_bytes(check, &quartic_buf)
// if quartic == -1 or -sqrt(-1)
// then isr = x^((p-5)/8) * sqrt(-1)
// else isr = x^((p-5)/8)
//
// The product is always computed, and then conditionally overwritten with
// the unmultiplied value when neither comparison matched.
fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1))
fe_cond_assign(out1, &tmp1, (m1|ms) ~ 1)
// Scrub all intermediates.
mem.zero_explicit(&tmp1, size_of(tmp1))
mem.zero_explicit(&tmp2, size_of(tmp2))
mem.zero_explicit(&tmp3, size_of(tmp3))
mem.zero_explicit(&quartic_buf, size_of(quartic_buf))
return p1 | m1
}
// fe_carry_inv computes the carried multiplicative inverse of arg1 into
// out1, built on top of fe_carry_invsqrt. out1 may alias arg1.
fe_carry_inv :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
	acc: Tight_Field_Element
	acc_loose := fe_relax_cast(&acc)

	// acc = x^2
	fe_carry_square(&acc, arg1)
	// acc = inverse square root of x^2 (the status result is not needed,
	// presumably because x^2 is always a square).
	_ = fe_carry_invsqrt(&acc, acc_loose)
	// acc = 1/x^2
	fe_carry_square(&acc, acc_loose)
	// out1 = x * 1/x^2 = 1/x
	fe_carry_mul(out1, acc_loose, arg1)

	mem.zero_explicit(&acc, size_of(acc))
}
@@ -0,0 +1,616 @@
// The BSD 1-Clause License (BSD-1-Clause)
//
// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package field_curve25519
// The file provides arithmetic on the field Z/(2^255-19) using
// unsaturated 64-bit integer arithmetic. It is derived primarily
// from the machine generated Golang output from the fiat-crypto project.
//
// While the base implementation is provably correct, this implementation
// makes no such claims as the port and optimizations were done by hand.
// At some point, it may be worth adding support to fiat-crypto for
// generating Odin output.
//
// TODO:
// * When fiat-crypto supports it, using saturated 64-bit limbs
// instead of 51-bit limbs will be faster, though the gains are
// minimal unless adcx/adox/mulx are used.
import fiat "core:crypto/_fiat"
import "core:math/bits"
// Loose_Field_Element is a field element stored as five u64 limbs. It is
// the output type of the non-carrying operations (fe_add, fe_sub, fe_opp)
// and the input type of the carrying ones (fe_carry, fe_carry_mul, ...).
Loose_Field_Element :: distinct [5]u64
// Tight_Field_Element is a field element stored as five u64 limbs. It is
// the output type of the carrying operations, which mask limbs to 51 bits.
Tight_Field_Element :: distinct [5]u64
// SQRT_M1 is the field constant used by fe_carry_invsqrt.
// NOTE(review): by its name this is presumably a square root of -1 modulo
// 2^255-19 in limb form; the limb values were not re-derived here.
// It is declared as a variable rather than a constant, and fe_carry_invsqrt
// takes its address; it must never be written to.
SQRT_M1 := Tight_Field_Element{
1718705420411056,
234908883556509,
2233514472574048,
2117202627021982,
765476049583133,
}
// _addcarryx_u51 adds arg2, arg3 and the incoming carry arg1, returning the
// low 51 bits of the sum in out1 and the bits above them (truncated to a
// fiat.u1) as the outgoing carry in out2.
_addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
	sum := u64(arg1) + arg2 + arg3
	out1 = sum & 0x7ffffffffffff
	out2 = fiat.u1(sum >> 51)
	return
}
// _subborrowx_u51 computes arg2 - arg1 - arg3 using signed 64-bit
// arithmetic, returning the low 51 bits of the difference in out1 and the
// outgoing borrow in out2 (1 when the difference is negative, else 0, for
// 51-bit operands).
_subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
	diff := i64(arg2) - i64(arg1) - i64(arg3)
	// Arithmetic shift: 0 for a non-negative difference, -1 for a negative one.
	sign := fiat.i1(diff >> 51)
	out1 = u64(diff) & 0x7ffffffffffff
	// Negating the sign (mod 2^8) maps -1 to a borrow of 1.
	out2 = 0x0 - fiat.u1(sign)
	return
}
// fe_carry_mul multiplies arg1 by arg2 and stores the carried product in
// out1. All inputs are read before out1 is written, so out1 may alias
// either argument.
fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
// The 25 limb-by-limb partial products, each as a (hi, lo) 128-bit pair.
// Products whose limb indices sum to 5 or more have the arg2 limb
// pre-multiplied by 0x13 (19) so they can be added into limb (i+j-5).
x2, x1 := bits.mul_u64(arg1[4], (arg2[4] * 0x13))
x4, x3 := bits.mul_u64(arg1[4], (arg2[3] * 0x13))
x6, x5 := bits.mul_u64(arg1[4], (arg2[2] * 0x13))
x8, x7 := bits.mul_u64(arg1[4], (arg2[1] * 0x13))
x10, x9 := bits.mul_u64(arg1[3], (arg2[4] * 0x13))
x12, x11 := bits.mul_u64(arg1[3], (arg2[3] * 0x13))
x14, x13 := bits.mul_u64(arg1[3], (arg2[2] * 0x13))
x16, x15 := bits.mul_u64(arg1[2], (arg2[4] * 0x13))
x18, x17 := bits.mul_u64(arg1[2], (arg2[3] * 0x13))
x20, x19 := bits.mul_u64(arg1[1], (arg2[4] * 0x13))
x22, x21 := bits.mul_u64(arg1[4], arg2[0])
x24, x23 := bits.mul_u64(arg1[3], arg2[1])
x26, x25 := bits.mul_u64(arg1[3], arg2[0])
x28, x27 := bits.mul_u64(arg1[2], arg2[2])
x30, x29 := bits.mul_u64(arg1[2], arg2[1])
x32, x31 := bits.mul_u64(arg1[2], arg2[0])
x34, x33 := bits.mul_u64(arg1[1], arg2[3])
x36, x35 := bits.mul_u64(arg1[1], arg2[2])
x38, x37 := bits.mul_u64(arg1[1], arg2[1])
x40, x39 := bits.mul_u64(arg1[1], arg2[0])
x42, x41 := bits.mul_u64(arg1[0], arg2[4])
x44, x43 := bits.mul_u64(arg1[0], arg2[3])
x46, x45 := bits.mul_u64(arg1[0], arg2[2])
x48, x47 := bits.mul_u64(arg1[0], arg2[1])
x50, x49 := bits.mul_u64(arg1[0], arg2[0])
// Limb 0: 128-bit sum of its five partial products, then split into the
// low 51 bits (x68) and the carry into limb 1 (x67).
x51, x52 := bits.add_u64(x13, x7, u64(0x0))
x53, _ := bits.add_u64(x14, x8, u64(fiat.u1(x52)))
x55, x56 := bits.add_u64(x17, x51, u64(0x0))
x57, _ := bits.add_u64(x18, x53, u64(fiat.u1(x56)))
x59, x60 := bits.add_u64(x19, x55, u64(0x0))
x61, _ := bits.add_u64(x20, x57, u64(fiat.u1(x60)))
x63, x64 := bits.add_u64(x49, x59, u64(0x0))
x65, _ := bits.add_u64(x50, x61, u64(fiat.u1(x64)))
x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff))
x68 := (x63 & 0x7ffffffffffff)
// Limb 4: 128-bit sum of its five partial products (x83:x81).
x69, x70 := bits.add_u64(x23, x21, u64(0x0))
x71, _ := bits.add_u64(x24, x22, u64(fiat.u1(x70)))
x73, x74 := bits.add_u64(x27, x69, u64(0x0))
x75, _ := bits.add_u64(x28, x71, u64(fiat.u1(x74)))
x77, x78 := bits.add_u64(x33, x73, u64(0x0))
x79, _ := bits.add_u64(x34, x75, u64(fiat.u1(x78)))
x81, x82 := bits.add_u64(x41, x77, u64(0x0))
x83, _ := bits.add_u64(x42, x79, u64(fiat.u1(x82)))
// Limb 3: 128-bit sum of its five partial products (x99:x97).
x85, x86 := bits.add_u64(x25, x1, u64(0x0))
x87, _ := bits.add_u64(x26, x2, u64(fiat.u1(x86)))
x89, x90 := bits.add_u64(x29, x85, u64(0x0))
x91, _ := bits.add_u64(x30, x87, u64(fiat.u1(x90)))
x93, x94 := bits.add_u64(x35, x89, u64(0x0))
x95, _ := bits.add_u64(x36, x91, u64(fiat.u1(x94)))
x97, x98 := bits.add_u64(x43, x93, u64(0x0))
x99, _ := bits.add_u64(x44, x95, u64(fiat.u1(x98)))
// Limb 2: 128-bit sum of its five partial products (x115:x113).
x101, x102 := bits.add_u64(x9, x3, u64(0x0))
x103, _ := bits.add_u64(x10, x4, u64(fiat.u1(x102)))
x105, x106 := bits.add_u64(x31, x101, u64(0x0))
x107, _ := bits.add_u64(x32, x103, u64(fiat.u1(x106)))
x109, x110 := bits.add_u64(x37, x105, u64(0x0))
x111, _ := bits.add_u64(x38, x107, u64(fiat.u1(x110)))
x113, x114 := bits.add_u64(x45, x109, u64(0x0))
x115, _ := bits.add_u64(x46, x111, u64(fiat.u1(x114)))
// Limb 1: 128-bit sum of its five partial products (x131:x129).
x117, x118 := bits.add_u64(x11, x5, u64(0x0))
x119, _ := bits.add_u64(x12, x6, u64(fiat.u1(x118)))
x121, x122 := bits.add_u64(x15, x117, u64(0x0))
x123, _ := bits.add_u64(x16, x119, u64(fiat.u1(x122)))
x125, x126 := bits.add_u64(x39, x121, u64(0x0))
x127, _ := bits.add_u64(x40, x123, u64(fiat.u1(x126)))
x129, x130 := bits.add_u64(x47, x125, u64(0x0))
x131, _ := bits.add_u64(x48, x127, u64(fiat.u1(x130)))
// Carry chain: propagate the bits above 51 from limb 0 through limbs 1..4,
// keeping the low 51 bits of each (x137, x142, x147, x152).
x133, x134 := bits.add_u64(x67, x129, u64(0x0))
x135 := (u64(fiat.u1(x134)) + x131)
x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff))
x137 := (x133 & 0x7ffffffffffff)
x138, x139 := bits.add_u64(x136, x113, u64(0x0))
x140 := (u64(fiat.u1(x139)) + x115)
x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff))
x142 := (x138 & 0x7ffffffffffff)
x143, x144 := bits.add_u64(x141, x97, u64(0x0))
x145 := (u64(fiat.u1(x144)) + x99)
x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff))
x147 := (x143 & 0x7ffffffffffff)
x148, x149 := bits.add_u64(x146, x81, u64(0x0))
x150 := (u64(fiat.u1(x149)) + x83)
x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff))
x152 := (x148 & 0x7ffffffffffff)
// Fold the carry out of limb 4 back into limb 0 (multiplied by 19), then
// carry once more from limb 0 into limb 1 and from limb 1 into limb 2.
x153 := (x151 * 0x13)
x154 := (x68 + x153)
x155 := (x154 >> 51)
x156 := (x154 & 0x7ffffffffffff)
x157 := (x155 + x137)
x158 := fiat.u1((x157 >> 51))
x159 := (x157 & 0x7ffffffffffff)
x160 := (u64(x158) + x142)
out1[0] = x156
out1[1] = x159
out1[2] = x160
out1[3] = x147
out1[4] = x152
}
// fe_carry_square squares arg1 and stores the carried result in out1. All
// inputs are read before out1 is written, so out1 may alias arg1.
fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
// Pre-scaled limbs: x1/x4 are limbs 4/3 times 19, x2/x5 are those times 2
// (i.e. times 38), and x3/x6/x7/x8 are limbs 4/3/2/1 doubled. These let
// each symmetric cross term be computed with a single multiplication.
x1 := (arg1[4] * 0x13)
x2 := (x1 * 0x2)
x3 := (arg1[4] * 0x2)
x4 := (arg1[3] * 0x13)
x5 := (x4 * 0x2)
x6 := (arg1[3] * 0x2)
x7 := (arg1[2] * 0x2)
x8 := (arg1[1] * 0x2)
// The 15 distinct partial products, each as a (hi, lo) 128-bit pair.
x10, x9 := bits.mul_u64(arg1[4], x1)
x12, x11 := bits.mul_u64(arg1[3], x2)
x14, x13 := bits.mul_u64(arg1[3], x4)
x16, x15 := bits.mul_u64(arg1[2], x2)
x18, x17 := bits.mul_u64(arg1[2], x5)
x20, x19 := bits.mul_u64(arg1[2], arg1[2])
x22, x21 := bits.mul_u64(arg1[1], x2)
x24, x23 := bits.mul_u64(arg1[1], x6)
x26, x25 := bits.mul_u64(arg1[1], x7)
x28, x27 := bits.mul_u64(arg1[1], arg1[1])
x30, x29 := bits.mul_u64(arg1[0], x3)
x32, x31 := bits.mul_u64(arg1[0], x6)
x34, x33 := bits.mul_u64(arg1[0], x7)
x36, x35 := bits.mul_u64(arg1[0], x8)
x38, x37 := bits.mul_u64(arg1[0], arg1[0])
// Limb 0: 128-bit sum of its partial products, then split into the low
// 51 bits (x48) and the carry into limb 1 (x47).
x39, x40 := bits.add_u64(x21, x17, u64(0x0))
x41, _ := bits.add_u64(x22, x18, u64(fiat.u1(x40)))
x43, x44 := bits.add_u64(x37, x39, u64(0x0))
x45, _ := bits.add_u64(x38, x41, u64(fiat.u1(x44)))
x47 := ((x43 >> 51) | ((x45 << 13) & 0xffffffffffffffff))
x48 := (x43 & 0x7ffffffffffff)
// Limb 4 (x55:x53).
x49, x50 := bits.add_u64(x23, x19, u64(0x0))
x51, _ := bits.add_u64(x24, x20, u64(fiat.u1(x50)))
x53, x54 := bits.add_u64(x29, x49, u64(0x0))
x55, _ := bits.add_u64(x30, x51, u64(fiat.u1(x54)))
// Limb 3 (x63:x61).
x57, x58 := bits.add_u64(x25, x9, u64(0x0))
x59, _ := bits.add_u64(x26, x10, u64(fiat.u1(x58)))
x61, x62 := bits.add_u64(x31, x57, u64(0x0))
x63, _ := bits.add_u64(x32, x59, u64(fiat.u1(x62)))
// Limb 2 (x71:x69).
x65, x66 := bits.add_u64(x27, x11, u64(0x0))
x67, _ := bits.add_u64(x28, x12, u64(fiat.u1(x66)))
x69, x70 := bits.add_u64(x33, x65, u64(0x0))
x71, _ := bits.add_u64(x34, x67, u64(fiat.u1(x70)))
// Limb 1 (x79:x77).
x73, x74 := bits.add_u64(x15, x13, u64(0x0))
x75, _ := bits.add_u64(x16, x14, u64(fiat.u1(x74)))
x77, x78 := bits.add_u64(x35, x73, u64(0x0))
x79, _ := bits.add_u64(x36, x75, u64(fiat.u1(x78)))
// Carry chain: propagate the bits above 51 from limb 0 through limbs 1..4,
// keeping the low 51 bits of each (x85, x90, x95, x100).
x81, x82 := bits.add_u64(x47, x77, u64(0x0))
x83 := (u64(fiat.u1(x82)) + x79)
x84 := ((x81 >> 51) | ((x83 << 13) & 0xffffffffffffffff))
x85 := (x81 & 0x7ffffffffffff)
x86, x87 := bits.add_u64(x84, x69, u64(0x0))
x88 := (u64(fiat.u1(x87)) + x71)
x89 := ((x86 >> 51) | ((x88 << 13) & 0xffffffffffffffff))
x90 := (x86 & 0x7ffffffffffff)
x91, x92 := bits.add_u64(x89, x61, u64(0x0))
x93 := (u64(fiat.u1(x92)) + x63)
x94 := ((x91 >> 51) | ((x93 << 13) & 0xffffffffffffffff))
x95 := (x91 & 0x7ffffffffffff)
x96, x97 := bits.add_u64(x94, x53, u64(0x0))
x98 := (u64(fiat.u1(x97)) + x55)
x99 := ((x96 >> 51) | ((x98 << 13) & 0xffffffffffffffff))
x100 := (x96 & 0x7ffffffffffff)
// Fold the carry out of limb 4 back into limb 0 (multiplied by 19), then
// carry once more from limb 0 into limb 1 and from limb 1 into limb 2.
x101 := (x99 * 0x13)
x102 := (x48 + x101)
x103 := (x102 >> 51)
x104 := (x102 & 0x7ffffffffffff)
x105 := (x103 + x85)
x106 := fiat.u1((x105 >> 51))
x107 := (x105 & 0x7ffffffffffff)
x108 := (u64(x106) + x90)
out1[0] = x104
out1[1] = x107
out1[2] = x108
out1[3] = x95
out1[4] = x100
}
// fe_carry propagates carries through the limbs of arg1 and stores the
// result in out1. All limbs of arg1 are read before out1 is written, so
// out1 may alias arg1.
fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
	LIMB_MASK :: 0x7ffffffffffff

	// First pass: ripple the bits above 51 upwards from limb 0 to limb 4.
	c0 := arg1[0]
	c1 := (c0 >> 51) + arg1[1]
	c2 := (c1 >> 51) + arg1[2]
	c3 := (c2 >> 51) + arg1[3]
	c4 := (c3 >> 51) + arg1[4]

	// Second pass: the carry out of limb 4 wraps around into limb 0
	// multiplied by 19, followed by a short carry into limbs 1 and 2.
	r0 := (c0 & LIMB_MASK) + ((c4 >> 51) * 0x13)
	r1 := u64(fiat.u1(r0 >> 51)) + (c1 & LIMB_MASK)
	r2 := u64(fiat.u1(r1 >> 51)) + (c2 & LIMB_MASK)

	out1[0] = r0 & LIMB_MASK
	out1[1] = r1 & LIMB_MASK
	out1[2] = r2
	out1[3] = c3 & LIMB_MASK
	out1[4] = c4 & LIMB_MASK
}
// fe_add stores the limb-wise sum of arg1 and arg2 in out1. No carry is
// performed. Each output limb depends only on the matching input limbs, so
// out1 may alias either argument.
fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
	for i in 0..<5 {
		out1[i] = arg1[i] + arg2[i]
	}
}
// fe_sub stores arg1 - arg2 in out1, computed limb-wise as
// (2p + arg1) - arg2 so that no limb underflows for tight inputs. No carry
// is performed. out1 may alias either argument.
fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
	// Limbs of 2 * (2^255 - 19): 2*(2^51 - 19) for limb 0, 2*(2^51 - 1) above.
	TWO_P_LIMB_0 :: 0xfffffffffffda
	TWO_P_LIMB_N :: 0xffffffffffffe

	out1[0] = (TWO_P_LIMB_0 + arg1[0]) - arg2[0]
	for i in 1..<5 {
		out1[i] = (TWO_P_LIMB_N + arg1[i]) - arg2[i]
	}
}
// fe_opp stores the negation of arg1 in out1, computed limb-wise as
// 2p - arg1. No carry is performed.
fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
	// Limbs of 2 * (2^255 - 19): 2*(2^51 - 19) for limb 0, 2*(2^51 - 1) above.
	TWO_P_LIMB_0 :: 0xfffffffffffda
	TWO_P_LIMB_N :: 0xffffffffffffe

	out1[0] = TWO_P_LIMB_0 - arg1[0]
	for i in 1..<5 {
		out1[i] = TWO_P_LIMB_N - arg1[i]
	}
}
// fe_cond_assign overwrites out1 with arg1 when arg2 is 1 and leaves out1
// unchanged when arg2 is 0, using a branch-free per-limb select.
fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: int) {
	take := fiat.u1(arg2)
	for i in 0..<5 {
		out1[i] = fiat.cmovznz_u64(take, out1[i], arg1[i])
	}
}
// fe_to_bytes serializes arg1 into the 32-byte little-endian encoding out1,
// after first reducing the value by at most one subtraction of
// p = 2^255 - 19.
fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) {
// Subtract p limb by limb (limbs of p: 0x7ffffffffffed, then 0x7ffffffffffff
// four times), propagating the borrow. x10 is the final borrow.
x1, x2 := _subborrowx_u51(0x0, arg1[0], 0x7ffffffffffed)
x3, x4 := _subborrowx_u51(x2, arg1[1], 0x7ffffffffffff)
x5, x6 := _subborrowx_u51(x4, arg1[2], 0x7ffffffffffff)
x7, x8 := _subborrowx_u51(x6, arg1[3], 0x7ffffffffffff)
x9, x10 := _subborrowx_u51(x8, arg1[4], 0x7ffffffffffff)
// x11 is an all-ones mask if the subtraction borrowed (value was < p),
// otherwise zero; p is added back under that mask without branching.
x11 := fiat.cmovznz_u64(x10, u64(0x0), 0xffffffffffffffff)
x12, x13 := _addcarryx_u51(0x0, x1, (x11 & 0x7ffffffffffed))
x14, x15 := _addcarryx_u51(x13, x3, (x11 & 0x7ffffffffffff))
x16, x17 := _addcarryx_u51(x15, x5, (x11 & 0x7ffffffffffff))
x18, x19 := _addcarryx_u51(x17, x7, (x11 & 0x7ffffffffffff))
x20, _ := _addcarryx_u51(x19, x9, (x11 & 0x7ffffffffffff))
// Pre-shift limbs 4..1 by their bit offset within the output byte they
// start in (limb k starts at bit 51*k: offsets 4, 1, 6 and 3 respectively).
x22 := (x20 << 4)
x23 := (x18 * u64(0x2))
x24 := (x16 << 6)
x25 := (x14 << 3)
// Limb 0 -> bytes 0..5, with its leftover top bits (x37) merged into the
// first byte of limb 1.
x26 := (u8(x12) & 0xff)
x27 := (x12 >> 8)
x28 := (u8(x27) & 0xff)
x29 := (x27 >> 8)
x30 := (u8(x29) & 0xff)
x31 := (x29 >> 8)
x32 := (u8(x31) & 0xff)
x33 := (x31 >> 8)
x34 := (u8(x33) & 0xff)
x35 := (x33 >> 8)
x36 := (u8(x35) & 0xff)
x37 := u8((x35 >> 8))
// Limb 1 -> bytes 6..11, leftover bits in x50.
x38 := (x25 + u64(x37))
x39 := (u8(x38) & 0xff)
x40 := (x38 >> 8)
x41 := (u8(x40) & 0xff)
x42 := (x40 >> 8)
x43 := (u8(x42) & 0xff)
x44 := (x42 >> 8)
x45 := (u8(x44) & 0xff)
x46 := (x44 >> 8)
x47 := (u8(x46) & 0xff)
x48 := (x46 >> 8)
x49 := (u8(x48) & 0xff)
x50 := u8((x48 >> 8))
// Limb 2 -> bytes 12..18, leftover bit in x65.
x51 := (x24 + u64(x50))
x52 := (u8(x51) & 0xff)
x53 := (x51 >> 8)
x54 := (u8(x53) & 0xff)
x55 := (x53 >> 8)
x56 := (u8(x55) & 0xff)
x57 := (x55 >> 8)
x58 := (u8(x57) & 0xff)
x59 := (x57 >> 8)
x60 := (u8(x59) & 0xff)
x61 := (x59 >> 8)
x62 := (u8(x61) & 0xff)
x63 := (x61 >> 8)
x64 := (u8(x63) & 0xff)
x65 := fiat.u1((x63 >> 8))
// Limb 3 -> bytes 19..24, leftover bits in x78.
x66 := (x23 + u64(x65))
x67 := (u8(x66) & 0xff)
x68 := (x66 >> 8)
x69 := (u8(x68) & 0xff)
x70 := (x68 >> 8)
x71 := (u8(x70) & 0xff)
x72 := (x70 >> 8)
x73 := (u8(x72) & 0xff)
x74 := (x72 >> 8)
x75 := (u8(x74) & 0xff)
x76 := (x74 >> 8)
x77 := (u8(x76) & 0xff)
x78 := u8((x76 >> 8))
// Limb 4 -> bytes 25..31.
x79 := (x22 + u64(x78))
x80 := (u8(x79) & 0xff)
x81 := (x79 >> 8)
x82 := (u8(x81) & 0xff)
x83 := (x81 >> 8)
x84 := (u8(x83) & 0xff)
x85 := (x83 >> 8)
x86 := (u8(x85) & 0xff)
x87 := (x85 >> 8)
x88 := (u8(x87) & 0xff)
x89 := (x87 >> 8)
x90 := (u8(x89) & 0xff)
x91 := u8((x89 >> 8))
out1[0] = x26
out1[1] = x28
out1[2] = x30
out1[3] = x32
out1[4] = x34
out1[5] = x36
out1[6] = x39
out1[7] = x41
out1[8] = x43
out1[9] = x45
out1[10] = x47
out1[11] = x49
out1[12] = x52
out1[13] = x54
out1[14] = x56
out1[15] = x58
out1[16] = x60
out1[17] = x62
out1[18] = x64
out1[19] = x67
out1[20] = x69
out1[21] = x71
out1[22] = x73
out1[23] = x75
out1[24] = x77
out1[25] = x80
out1[26] = x82
out1[27] = x84
out1[28] = x86
out1[29] = x88
out1[30] = x90
out1[31] = x91
}
// _fe_from_bytes deserializes the 32-byte little-endian encoding arg1 into
// five 51-bit limbs in out1.
//
// The top bit of arg1[31] is NOT masked here, so if it is set it ends up in
// limb 4 (which is not masked to 51 bits below). fe_from_bytes clears that
// bit on a copy before calling this routine.
_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
// Shift every byte to its bit position within the limb it (mostly) belongs
// to. Bytes that straddle a limb boundary are placed in the lower limb; the
// overflow past bit 51 is moved up when the limbs are assembled below.
x1 := (u64(arg1[31]) << 44)
x2 := (u64(arg1[30]) << 36)
x3 := (u64(arg1[29]) << 28)
x4 := (u64(arg1[28]) << 20)
x5 := (u64(arg1[27]) << 12)
x6 := (u64(arg1[26]) << 4)
x7 := (u64(arg1[25]) << 47)
x8 := (u64(arg1[24]) << 39)
x9 := (u64(arg1[23]) << 31)
x10 := (u64(arg1[22]) << 23)
x11 := (u64(arg1[21]) << 15)
x12 := (u64(arg1[20]) << 7)
x13 := (u64(arg1[19]) << 50)
x14 := (u64(arg1[18]) << 42)
x15 := (u64(arg1[17]) << 34)
x16 := (u64(arg1[16]) << 26)
x17 := (u64(arg1[15]) << 18)
x18 := (u64(arg1[14]) << 10)
x19 := (u64(arg1[13]) << 2)
x20 := (u64(arg1[12]) << 45)
x21 := (u64(arg1[11]) << 37)
x22 := (u64(arg1[10]) << 29)
x23 := (u64(arg1[9]) << 21)
x24 := (u64(arg1[8]) << 13)
x25 := (u64(arg1[7]) << 5)
x26 := (u64(arg1[6]) << 48)
x27 := (u64(arg1[5]) << 40)
x28 := (u64(arg1[4]) << 32)
x29 := (u64(arg1[3]) << 24)
x30 := (u64(arg1[2]) << 16)
x31 := (u64(arg1[1]) << 8)
x32 := arg1[0]
// Limb 0: bytes 0..6; bits above 51 (x40) spill into limb 1.
x33 := (x31 + u64(x32))
x34 := (x30 + x33)
x35 := (x29 + x34)
x36 := (x28 + x35)
x37 := (x27 + x36)
x38 := (x26 + x37)
x39 := (x38 & 0x7ffffffffffff)
x40 := u8((x38 >> 51))
// Limb 1: bytes 7..12 plus the spill from limb 0.
x41 := (x25 + u64(x40))
x42 := (x24 + x41)
x43 := (x23 + x42)
x44 := (x22 + x43)
x45 := (x21 + x44)
x46 := (x20 + x45)
x47 := (x46 & 0x7ffffffffffff)
x48 := u8((x46 >> 51))
// Limb 2: bytes 13..19 plus the spill from limb 1.
x49 := (x19 + u64(x48))
x50 := (x18 + x49)
x51 := (x17 + x50)
x52 := (x16 + x51)
x53 := (x15 + x52)
x54 := (x14 + x53)
x55 := (x13 + x54)
x56 := (x55 & 0x7ffffffffffff)
x57 := u8((x55 >> 51))
// Limb 3: bytes 20..25 plus the spill from limb 2.
x58 := (x12 + u64(x57))
x59 := (x11 + x58)
x60 := (x10 + x59)
x61 := (x9 + x60)
x62 := (x8 + x61)
x63 := (x7 + x62)
x64 := (x63 & 0x7ffffffffffff)
x65 := u8((x63 >> 51))
// Limb 4: bytes 26..31 plus the spill from limb 3 (not masked).
x66 := (x6 + u64(x65))
x67 := (x5 + x66)
x68 := (x4 + x67)
x69 := (x3 + x68)
x70 := (x2 + x69)
x71 := (x1 + x70)
out1[0] = x39
out1[1] = x47
out1[2] = x56
out1[3] = x64
out1[4] = x71
}
// fe_relax copies the limbs of the tight element arg1 into the loose
// element out1 unchanged.
fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
	for i in 0..<5 {
		out1[i] = arg1[i]
	}
}
// fe_carry_scmul_121666 multiplies arg1 by the constant 121666 (0x1db42)
// and stores the carried result in out1. All limbs of arg1 are read before
// out1 is written, so out1 may alias arg1.
fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
// One 64x64 -> 128-bit product per limb, as (hi, lo) pairs.
x2, x1 := bits.mul_u64(0x1db42, arg1[4])
x4, x3 := bits.mul_u64(0x1db42, arg1[3])
x6, x5 := bits.mul_u64(0x1db42, arg1[2])
x8, x7 := bits.mul_u64(0x1db42, arg1[1])
x10, x9 := bits.mul_u64(0x1db42, arg1[0])
// Carry chain: split each 128-bit limb into its low 51 bits and the carry
// into the next limb, from limb 0 up to limb 4.
x11 := ((x9 >> 51) | ((x10 << 13) & 0xffffffffffffffff))
x12 := (x9 & 0x7ffffffffffff)
x13, x14 := bits.add_u64(x11, x7, u64(0x0))
x15 := (u64(fiat.u1(x14)) + x8)
x16 := ((x13 >> 51) | ((x15 << 13) & 0xffffffffffffffff))
x17 := (x13 & 0x7ffffffffffff)
x18, x19 := bits.add_u64(x16, x5, u64(0x0))
x20 := (u64(fiat.u1(x19)) + x6)
x21 := ((x18 >> 51) | ((x20 << 13) & 0xffffffffffffffff))
x22 := (x18 & 0x7ffffffffffff)
x23, x24 := bits.add_u64(x21, x3, u64(0x0))
x25 := (u64(fiat.u1(x24)) + x4)
x26 := ((x23 >> 51) | ((x25 << 13) & 0xffffffffffffffff))
x27 := (x23 & 0x7ffffffffffff)
x28, x29 := bits.add_u64(x26, x1, u64(0x0))
x30 := (u64(fiat.u1(x29)) + x2)
x31 := ((x28 >> 51) | ((x30 << 13) & 0xffffffffffffffff))
x32 := (x28 & 0x7ffffffffffff)
// Fold the carry out of limb 4 back into limb 0 (multiplied by 19), then
// carry once more from limb 0 into limb 1 and from limb 1 into limb 2.
x33 := (x31 * 0x13)
x34 := (x12 + x33)
x35 := fiat.u1((x34 >> 51))
x36 := (x34 & 0x7ffffffffffff)
x37 := (u64(x35) + x17)
x38 := fiat.u1((x37 >> 51))
x39 := (x37 & 0x7ffffffffffff)
x40 := (u64(x38) + x22)
out1[0] = x36
out1[1] = x39
out1[2] = x40
out1[3] = x27
out1[4] = x32
}
// The following routines were added by hand, and do not come from fiat-crypto.
// fe_zero sets out1 to the field element 0 (all limbs zero).
fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
	for i in 0..<5 {
		out1[i] = 0
	}
}
// fe_one sets out1 to the field element 1 (limb 0 is 1, all others zero).
fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
	out1[0] = 1
	for i in 1..<5 {
		out1[i] = 0
	}
}
// fe_set copies arg1 into out1.
fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
	out1^ = arg1^
}
// fe_cond_swap exchanges out1 and out2 when arg1 is 1 and leaves both
// unchanged when arg1 is 0, without branching on arg1.
fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
	// All ones when arg1 == 1, all zeroes when arg1 == 0.
	mask := -u64(arg1)
	for i in 0..<5 {
		// XOR-swap under the mask: delta is zero when no swap is wanted.
		delta := (out1[i] ~ out2[i]) & mask
		out1[i] ~= delta
		out2[i] ~= delta
	}
}
+126
View File
@@ -0,0 +1,126 @@
package x25519
import field "core:crypto/_fiat/field_curve25519"
import "core:mem"
// SCALAR_SIZE is the required length in bytes of a scalar passed to
// scalarmult.
SCALAR_SIZE :: 32
// POINT_SIZE is the required length in bytes of both the input point and
// the destination buffer passed to scalarmult.
POINT_SIZE :: 32
// _BASE_POINT is the point scalarmult_basepoint multiplies by: the byte 9
// followed by 31 zero bytes.
_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
// _scalar_bit returns bit i of the little-endian scalar s (bit 0 is the
// least significant bit of s[0]). Negative indices yield 0.
_scalar_bit :: #force_inline proc "contextless" (s: ^[32]byte, i: int) -> u8 {
	if i >= 0 {
		byte_index := i >> 3
		bit_shift := uint(i & 7)
		return (s[byte_index] >> bit_shift) & 1
	}
	return 0
}
// _scalarmult runs a Montgomery ladder over bits 254..0 of scalar, starting
// from the u-coordinate encoded in point, and writes the encoded affine
// result to out. The scalar is used exactly as given; clamping is done by
// the caller (see scalarmult). All field temporaries are zeroed before
// returning.
_scalarmult :: proc (out, scalar, point: ^[32]byte) {
// Montgomery pseudo-multiplication taken from Monocypher.
// Decode the input u-coordinate.
x1: field.Tight_Field_Element = ---
field.fe_from_bytes(&x1, point)
// computes the actual scalar product (the result is in x2 and z2)
x2, x3, z2, z3: field.Tight_Field_Element = ---, ---, ---, ---
t0, t1: field.Loose_Field_Element = ---, ---
// Montgomery ladder
// In projective coordinates, to avoid divisions: x = X / Z
// We don't care about the y coordinate, it's only 1 bit of information
field.fe_one(&x2) // "zero" point
field.fe_zero(&z2)
field.fe_set(&x3, &x1) // "one" point
field.fe_one(&z3)
swap: int
for pos := 255-1; pos >= 0; pos = pos - 1 {
// constant time conditional swap before ladder step
b := int(_scalar_bit(scalar, pos))
swap ~= b // xor trick avoids swapping at the end of the loop
field.fe_cond_swap(&x2, &x3, swap)
field.fe_cond_swap(&z2, &z3, swap)
swap = b // anticipates one last swap after the loop
// Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
// with differential addition
//
// Note: This deliberately omits reductions after add/sub operations
// if the result is only ever used as the input to a mul/square since
// the implementations of those can deal with non-reduced inputs.
//
// fe_tighten_cast is only used to store a fully reduced
// output in a Loose_Field_Element, or to provide such a
// Loose_Field_Element as a Tight_Field_Element argument.
field.fe_sub(&t0, &x3, &z3)
field.fe_sub(&t1, &x2, &z2)
field.fe_add(field.fe_relax_cast(&x2), &x2, &z2) // x2 - unreduced
field.fe_add(field.fe_relax_cast(&z2), &x3, &z3) // z2 - unreduced
field.fe_carry_mul(&z3, &t0, field.fe_relax_cast(&x2))
field.fe_carry_mul(&z2, field.fe_relax_cast(&z2), &t1) // z2 - reduced
field.fe_carry_square(field.fe_tighten_cast(&t0), &t1) // t0 - reduced
field.fe_carry_square(field.fe_tighten_cast(&t1), field.fe_relax_cast(&x2)) // t1 - reduced
field.fe_add(field.fe_relax_cast(&x3), &z3, &z2) // x3 - unreduced
field.fe_sub(field.fe_relax_cast(&z2), &z3, &z2) // z2 - unreduced
field.fe_carry_mul(&x2, &t1, &t0) // x2 - reduced
field.fe_sub(&t1, field.fe_tighten_cast(&t1), field.fe_tighten_cast(&t0)) // safe - t1/t0 is reduced
field.fe_carry_square(&z2, field.fe_relax_cast(&z2)) // z2 - reduced
field.fe_carry_scmul_121666(&z3, &t1)
field.fe_carry_square(&x3, field.fe_relax_cast(&x3)) // x3 - reduced
field.fe_add(&t0, field.fe_tighten_cast(&t0), &z3) // safe - t0 is reduced
field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z2))
field.fe_carry_mul(&z2, &t1, &t0)
}
// last swap is necessary to compensate for the xor trick
// Note: after this swap, P3 == P2 + P1.
field.fe_cond_swap(&x2, &x3, swap)
field.fe_cond_swap(&z2, &z3, swap)
// normalises the coordinates: x == X / Z
field.fe_carry_inv(&z2, field.fe_relax_cast(&z2))
field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
field.fe_to_bytes(out, &x2)
// Scrub every intermediate that was derived from the scalar.
mem.zero_explicit(&x1, size_of(x1))
mem.zero_explicit(&x2, size_of(x2))
mem.zero_explicit(&x3, size_of(x3))
mem.zero_explicit(&z2, size_of(z2))
mem.zero_explicit(&z3, size_of(z3))
mem.zero_explicit(&t0, size_of(t0))
mem.zero_explicit(&t1, size_of(t1))
}
// scalarmult computes the X25519 function of scalar and point, writing the
// 32-byte result to dst.
//
// Panics if scalar is not SCALAR_SIZE bytes, or if point or dst are not
// POINT_SIZE bytes. The caller's scalar and point buffers are not modified;
// the scalar is clamped on a private copy.
scalarmult :: proc (dst, scalar, point: []byte) {
	switch {
	case len(scalar) != SCALAR_SIZE:
		panic("crypto/x25519: invalid scalar size")
	case len(point) != POINT_SIZE:
		panic("crypto/x25519: invalid point size")
	case len(dst) != POINT_SIZE:
		panic("crypto/x25519: invalid destination point size")
	}

	// "clamp" the scalar: clear the low 3 bits and the top bit, set bit 254.
	clamped: [32]byte = ---
	copy(clamped[:], scalar)
	clamped[0] &= 248
	clamped[31] = (clamped[31] & 127) | 64

	u_coord: [32]byte = ---
	copy(u_coord[:], point)

	product: [32]byte = ---
	_scalarmult(&product, &clamped, &u_coord)
	copy(dst, product[:])

	// Scrub the secret-dependent temporaries.
	mem.zero_explicit(&clamped, size_of(clamped))
	mem.zero_explicit(&product, size_of(product))
}
// scalarmult_basepoint computes scalarmult(dst, scalar, base point), with
// the same size requirements and panics as scalarmult.
scalarmult_basepoint :: proc (dst, scalar: []byte) {
	// TODO/perf: Switch to using a precomputed table.
	base_point := _BASE_POINT[:]
	scalarmult(dst, scalar, base_point)
}
+5
View File
@@ -115,6 +115,11 @@ main :: proc() {
test_haval_224(&t)
test_haval_256(&t)
// "modern" crypto tests
test_x25519(&t)
bench_modern(&t)
fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
}
@@ -0,0 +1,95 @@
package test_core_crypto
import "core:testing"
import "core:fmt"
import "core:time"
import "core:crypto/x25519"
// _digit_value maps an alphanumeric rune to its digit value: '0'..'9' give
// 0..9, and letters of either case give 10..35. Any other rune yields 16.
_digit_value :: proc(r: rune) -> int {
	if '0' <= r && r <= '9' {
		return int(r) - '0'
	}
	if 'a' <= r && r <= 'z' {
		return int(r) - 'a' + 10
	}
	if 'A' <= r && r <= 'Z' {
		return int(r) - 'A' + 10
	}
	return 16
}
// _decode_hex32 decodes a hex string of up to 64 characters into a 32-byte
// array. Inputs shorter than 64 characters leave the trailing bytes zero.
//
// Asserts that the input has even length, fits in 32 bytes, and consists
// only of hex digits, so that a malformed test vector fails loudly instead
// of decoding to garbage or tripping a bare bounds check.
_decode_hex32 :: proc(s: string) -> [32]byte {
	b: [32]byte
	assert(len(s) % 2 == 0, "_decode_hex32: odd-length hex string")
	assert(len(s) <= 2 * len(b), "_decode_hex32: hex string longer than 32 bytes")
	for i := 0; i < len(s); i = i + 2 {
		hi := _digit_value(rune(s[i]))
		lo := _digit_value(rune(s[i+1]))
		// _digit_value returns values >= 16 for anything that is not a hex digit.
		assert(hi < 16 && lo < 16, "_decode_hex32: invalid hex digit")
		b[i/2] = byte(hi << 4 | lo)
	}
	return b
}
// TestECDH is a single scalar-multiplication test vector; all three fields
// are hex strings, with product being the expected result of
// scalar * point.
TestECDH :: struct {
	scalar, point, product: string,
}
// test_x25519 checks x25519.scalarmult against known-answer vectors, and
// checks that scalarmult_basepoint agrees with scalarmult on the base point.
@(test)
test_x25519 :: proc(t: ^testing.T) {
	log(t, "Testing X25519")

	// Test vectors from RFC 7748
	test_vectors := [?]TestECDH {
		TestECDH{
			scalar  = "a546e36bf0527c9d3b16154b82465edd62144c0ac1fc5a18506a2244ba449ac4",
			point   = "e6db6867583030db3594c1a424b15f7c726624ec26b3353b10a903a6d0ab1c4c",
			product = "c3da55379de9c6908e94ea4df28d084f32eccf03491c71f754b4075577a28552",
		},
		TestECDH{
			scalar  = "4b66e9d4d1b4673c5ad22691957d6af5c11b6421e0ea01d42ca4169e7918ba0d",
			point   = "e5210f12786811d3f4b7959d0538ae2c31dbe7106fc03c3efc4cd549c715a493",
			product = "95cbde9476e8907d7aade45cb4b873f88b595a68799fa152e6f8f7647aac7957",
		},
	}

	for vec in test_vectors {
		scalar := _decode_hex32(vec.scalar)
		point := _decode_hex32(vec.point)

		// Known-answer check.
		derived_point: [x25519.POINT_SIZE]byte
		x25519.scalarmult(derived_point[:], scalar[:], point[:])
		derived_point_str := hex_string(derived_point[:])
		expect(t, derived_point_str == vec.product, fmt.tprintf("Expected %s for %s * %s, but got %s instead", vec.product, vec.scalar, vec.point, derived_point_str))

		// Abuse the test vectors to sanity-check the scalar-basepoint multiply.
		via_basepoint, via_generic: [x25519.POINT_SIZE]byte
		x25519.scalarmult_basepoint(via_basepoint[:], scalar[:])
		x25519.scalarmult(via_generic[:], scalar[:], x25519._BASE_POINT[:])
		via_basepoint_str := hex_string(via_basepoint[:])
		via_generic_str := hex_string(via_generic[:])
		expect(t, via_basepoint_str == via_generic_str, fmt.tprintf("Expected %s for %s * basepoint, but got %s instead", via_generic_str, vec.scalar, via_basepoint_str))
	}

	// TODO/tests: Run the wycheproof test vectors, once I figure out
	// how to work with JSON.
}
// bench_modern prints a banner and then runs the benchmarks for the
// "modern" primitives (currently only X25519).
@(test)
bench_modern :: proc(t: ^testing.T) {
fmt.println("Starting benchmarks:")
bench_x25519(t)
}
// bench_x25519 times a fixed number of x25519.scalarmult calls on constant
// inputs and logs the average cost per call in microseconds.
bench_x25519 :: proc(t: ^testing.T) {
	iters :: 10000

	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")
	out: [x25519.POINT_SIZE]byte = ---

	start := time.now()
	for _ in 0..<iters {
		x25519.scalarmult(out[:], scalar[:], point[:])
	}
	elapsed := time.since(start)

	us_per_op := time.duration_microseconds(elapsed) / iters
	log(t, fmt.tprintf("x25519.scalarmult: ~%f us/op", us_per_op))
}