core/crypto: Add x25519

This package implements the X25519 key agreement scheme as specified in RFC 7748, using routines taken from fiat-crypto and Monocypher.
2026-07-22 23:47:51 +00:00 · 2021-11-17 13:59:53 +00:00
parent d1e76ee4f2
commit 1a7a6a9116
7 changed files with 1039 additions and 0 deletions
@@ -0,0 +1,35 @@
+# fiat
+
+This package contains low level arithmetic required to implement certain
+cryptographic primitives, ported from the [fiat-crypto project][1]
+along with some higher-level helpers.
+
+## Notes
+
+fiat-crypto gives the choice of 3 licenses for derived works.  The 1-Clause
+BSD license is chosen as it is compatible with Odin's existing licensing.
+
+The routines are intended to be timing-safe, as long as the underlying
+integer arithmetic is constant time.  This is true on most systems commonly
+used today, with the notable exception of WASM.
+
+While fiat-crypto provides output targeting both 32-bit and 64-bit
+architectures, only the 64-bit versions were used, as 32-bit architectures
+are becoming increasingly uncommon and irrelevant.
+
+With the current Odin syntax, the Go output is trivially ported in most
+cases and was used as the basis of the port.
+
+In the future, it would be better to auto-generate Odin either directly
+by adding an appropriate code-gen backend written in Coq, or perhaps by
+parsing the JSON output.
+
+As this is a port rather than autogenerated output, none of fiat-crypto's
+formal verification guarantees apply, unless it is possible to prove binary
+equivalence.
+
+For the most part, alterations to the base fiat-crypto generated code were
+kept to a minimum, to aid auditability.  This results in a somewhat
+idiosyncratic style, and in some cases minor performance penalties.
+
+[1]: https://github.com/mit-plv/fiat-crypto
@@ -0,0 +1,24 @@
+package fiat
+
+// This package provides various helpers and types common to all of the
+// fiat-crypto derived backends.
+
+// This code only works on a two's complement system.
+#assert((-1 & 3) == 3)
+
+// u1 and i1 are the single-bit carry/borrow/flag types used by the
+// fiat-crypto derived routines.  They are backed by a full byte; callers
+// are expected to only ever store 0 or 1 (u1), as nothing here enforces
+// the range.
+u1 :: distinct u8
+i1 :: distinct i8
+
+// cmovznz_u64 is a branch-free conditional select on 64-bit values.
+//
+// Returns arg2 when arg1 is 0, and arg3 when arg1 is 1.
+cmovznz_u64 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u64) -> (out1: u64) {
+	// The multiply stretches the flag into a mask: all-zeroes for 0,
+	// all-ones for 1.
+	mask := u64(arg1) * 0xffffffffffffffff
+	out1 = (mask & arg3) | (~mask & arg2)
+	return
+}
+
+// cmovznz_u32 is a branch-free conditional select on 32-bit values.
+//
+// Returns arg2 when arg1 is 0, and arg3 when arg1 is 1.
+cmovznz_u32 :: #force_inline proc "contextless" (arg1: u1, arg2, arg3: u32) -> (out1: u32) {
+	// The multiply stretches the flag into a mask: all-zeroes for 0,
+	// all-ones for 1.
+	mask := u32(arg1) * 0xffffffff
+	out1 = (mask & arg3) | (~mask & arg2)
+	return
+}
@@ -0,0 +1,138 @@
+package field_curve25519
+
+import "core:crypto"
+import "core:mem"
+
+// fe_relax_cast reinterprets a pointer to a Tight_Field_Element as a
+// pointer to a Loose_Field_Element.  No data is copied; the returned
+// pointer refers to the same storage as arg1.
+fe_relax_cast :: #force_inline proc "contextless" (arg1: ^Tight_Field_Element) -> ^Loose_Field_Element {
+	return transmute(^Loose_Field_Element)(arg1)
+}
+
+// fe_tighten_cast reinterprets a pointer to a Loose_Field_Element as a
+// pointer to a Tight_Field_Element.  No data is copied and no carry is
+// performed, so it is the caller's job to make sure the limbs really are
+// within the range expected of a tight element.
+fe_tighten_cast :: #force_inline proc "contextless" (arg1: ^Loose_Field_Element) -> ^Tight_Field_Element {
+	return transmute(^Tight_Field_Element)(arg1)
+}
+
+// fe_from_bytes deserializes the 32-byte little-endian encoding in arg1
+// into out1, ignoring the most significant bit of the final byte.
+//
+// arg1 itself is never modified.
+fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
+	// Operate on a private copy so that the unused bit can be masked off
+	// without touching the caller's buffer; wipe the copy on the way out.
+	buf := arg1^
+	defer mem.zero_explicit(&buf, size_of(buf))
+
+	buf[31] &= 127
+	_fe_from_bytes(out1, &buf)
+}
+
+// fe_equal compares two field elements by serializing arg2 and handing
+// off to fe_equal_bytes, whose result is returned unchanged.
+fe_equal :: proc "contextless" (arg1, arg2: ^Tight_Field_Element) -> int {
+	rhs: [32]byte = ---
+	defer mem.zero_explicit(&rhs, size_of(rhs))
+
+	fe_to_bytes(&rhs, arg2)
+	return fe_equal_bytes(arg1, &rhs)
+}
+
+// fe_equal_bytes compares the serialized form of arg1 against the 32-byte
+// encoding in arg2, and returns the result of
+// crypto.compare_constant_time on the two buffers.
+fe_equal_bytes :: proc "contextless" (arg1: ^Tight_Field_Element, arg2: ^[32]byte) -> int {
+	lhs: [32]byte = ---
+	defer mem.zero_explicit(&lhs, size_of(lhs))
+
+	fe_to_bytes(&lhs, arg1)
+	return crypto.compare_constant_time(lhs[:], arg2[:])
+}
+
+// fe_carry_pow2k sets out1 = arg1^(2^arg2), by squaring arg1 arg2 times.
+//
+// out1 and arg1 may refer to the same storage (callers in this package
+// pass fe_relax_cast(out1) as arg1).
+fe_carry_pow2k :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element, arg2: uint) {
+	// Special case: `arg1^(2^0) = arg1`, i.e. zero squarings leaves the
+	// value unchanged, so only a carry is required to produce a tight
+	// result.  None of the callers in this package pass 0.
+	if arg2 == 0 {
+		fe_carry(out1, arg1)
+		return
+	}
+
+	fe_carry_square(out1, arg1)
+	for _ in 1..<arg2 {
+		fe_carry_square(out1, fe_relax_cast(out1))
+	}
+}
+
+// fe_carry_opp negates arg1 with fe_opp and carries the result into out1.
+// The storage of out1 doubles as scratch space for the uncarried value.
+fe_carry_opp :: #force_inline proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	scratch := fe_relax_cast(out1)
+	fe_opp(scratch, arg1)
+	fe_carry(out1, scratch)
+}
+
+// fe_carry_invsqrt writes a candidate inverse square root of arg1 to out1.
+//
+// Returns `p1 | m1`, i.e. non-zero when the serialized quartic
+// `x^((p-1)/4)` compares equal to 1 or to -1.
+// NOTE(review): upstream Monocypher documents this as "arg1 is a non-zero
+// square"; the 0/1 convention comes from fe_equal_bytes - confirm.
+//
+// All temporaries are wiped before returning.
+fe_carry_invsqrt :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) -> int {
+	// Inverse square root taken from Monocypher.
+
+	tmp1, tmp2, tmp3: Tight_Field_Element = ---, ---, ---
+
+	// t0 = x^((p-5)/8)
+	// Can be achieved with a simple double & add ladder,
+	// but it would be slower.
+	//
+	// The statement order below is a fixed addition chain; tmp1 holds t0
+	// once it completes.
+	fe_carry_pow2k(&tmp1, arg1, 1)
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp2, arg1, fe_relax_cast(&tmp2))
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 1)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 5)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 10)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 20)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 10)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp1), 50)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp3, fe_relax_cast(&tmp2), 100)
+	fe_carry_mul(&tmp2, fe_relax_cast(&tmp3), fe_relax_cast(&tmp2))
+	fe_carry_pow2k(&tmp2, fe_relax_cast(&tmp2), 50)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp2), fe_relax_cast(&tmp1))
+	fe_carry_pow2k(&tmp1, fe_relax_cast(&tmp1), 2)
+	fe_carry_mul(&tmp1, fe_relax_cast(&tmp1), arg1)
+
+	// quartic = x^((p-1)/4) = t0^2 * x
+	quartic := &tmp2
+	fe_carry_square(quartic, fe_relax_cast(&tmp1))
+	fe_carry_mul(quartic, fe_relax_cast(quartic), arg1)
+
+	// Serialize quartic once to save on repeated serialization/sanitization.
+	quartic_buf: [32]byte = ---
+	fe_to_bytes(&quartic_buf, quartic)
+	check := &tmp3
+
+	// Compare quartic against 1, -1 and -sqrt(-1) in turn, reusing `check`
+	// as the storage for each reference value.
+	fe_one(check)
+	p1 := fe_equal_bytes(check, &quartic_buf)
+	fe_carry_opp(check, check)
+	m1 := fe_equal_bytes(check, &quartic_buf)
+	fe_carry_opp(check, &SQRT_M1)
+	ms := fe_equal_bytes(check, &quartic_buf)
+
+	// if quartic == -1 or -sqrt(-1)
+	// then  isr = t0 * sqrt(-1)
+	// else  isr = t0
+	//
+	// Both candidates are always computed; the constant time conditional
+	// assignment overwrites the product with t0 when neither m1 nor ms
+	// is set.
+	fe_carry_mul(out1, fe_relax_cast(&tmp1), fe_relax_cast(&SQRT_M1))
+	fe_cond_assign(out1, &tmp1, (m1|ms) ~ 1)
+
+	mem.zero_explicit(&tmp1, size_of(tmp1))
+	mem.zero_explicit(&tmp2, size_of(tmp2))
+	mem.zero_explicit(&tmp3, size_of(tmp3))
+	mem.zero_explicit(&quartic_buf, size_of(quartic_buf))
+
+	return p1 | m1
+}
+
+// fe_carry_inv sets out1 to the multiplicative inverse of arg1, derived
+// from the inverse square root of arg1^2:
+//
+//   (1 / sqrt(arg1^2))^2 * arg1 = arg1 / arg1^2 = 1 / arg1
+//
+// out1 and arg1 may refer to the same storage.
+fe_carry_inv :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	acc: Tight_Field_Element
+	defer mem.zero_explicit(&acc, size_of(acc))
+
+	acc_loose := fe_relax_cast(&acc)
+
+	fe_carry_square(&acc, arg1)             // arg1^2
+	_ = fe_carry_invsqrt(&acc, acc_loose)   // +-1/arg1
+	fe_carry_square(&acc, acc_loose)        // 1/arg1^2
+	fe_carry_mul(out1, acc_loose, arg1)     // 1/arg1
+}
@@ -0,0 +1,616 @@
+// The BSD 1-Clause License (BSD-1-Clause)
+//
+// Copyright (c) 2015-2020 the fiat-crypto authors (see the AUTHORS file)
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     1. Redistributions of source code must retain the above copyright
+//        notice, this list of conditions and the following disclaimer.
+//
+// THIS SOFTWARE IS PROVIDED BY the fiat-crypto authors "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Berkeley Software Design,
+// Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package field_curve25519
+
+// The file provides arithmetic on the field Z/(2^255-19) using
+// unsaturated 64-bit integer arithmetic.  It is derived primarily
+// from the machine generated Golang output from the fiat-crypto project.
+//
+// While the base implementation is provably correct, this implementation
+// makes no such claims as the port and optimizations were done by hand.
+// At some point, it may be worth adding support to fiat-crypto for
+// generating Odin output.
+//
+// TODO:
+//  * When fiat-crypto supports it, using saturated 64-bit limbs
+//    instead of 51-bit limbs will be faster, though the gains are
+//    minimal unless adcx/adox/mulx are used.
+
+import fiat "core:crypto/_fiat"
+import "core:math/bits"
+
+// Field elements are stored as five u64 limbs, least significant limb
+// first, with 51 bits of the value per limb (see the 0x7ffffffffffff masks
+// used throughout this file).
+//
+// Tight_Field_Element is what the carrying routines (fe_carry,
+// fe_carry_mul, fe_carry_square, ...) produce.  Loose_Field_Element is
+// what fe_add, fe_sub and fe_opp produce, and what the carrying routines
+// accept as input.
+Loose_Field_Element :: distinct [5]u64
+Tight_Field_Element :: distinct [5]u64
+
+// SQRT_M1 holds the limbs of the constant used as sqrt(-1) by
+// fe_carry_invsqrt.  It is a variable rather than a constant because
+// callers take its address.
+SQRT_M1 := Tight_Field_Element{
+	1718705420411056,
+	234908883556509,
+	2233514472574048,
+	2117202627021982,
+	765476049583133,
+}
+
+// _addcarryx_u51 adds two 51-bit limbs plus an incoming carry.
+//
+// Returns the low 51 bits of the sum (out1) and the bits above them
+// truncated to a fiat.u1 (out2).
+_addcarryx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	sum := u64(arg1) + arg2 + arg3
+	out1 = sum & 0x7ffffffffffff
+	out2 = fiat.u1(sum >> 51)
+	return
+}
+
+// _subborrowx_u51 computes arg2 - arg1 - arg3 on 51-bit limbs, where arg1
+// is the incoming borrow.
+//
+// Returns the low 51 bits of the difference (out1) and the outgoing
+// borrow (out2).
+_subborrowx_u51 :: #force_inline proc "contextless" (arg1: fiat.u1, arg2, arg3: u64) -> (out1: u64, out2: fiat.u1) {
+	// Done on signed values so that the arithmetic shift below exposes
+	// the sign of the difference.
+	diff := i64(arg2) - i64(arg1) - i64(arg3)
+	sign := fiat.i1(diff >> 51)
+
+	out1 = u64(diff) & 0x7ffffffffffff
+	out2 = 0x0 - fiat.u1(sign)
+	return
+}
+
+// fe_carry_mul multiplies arg1 by arg2 in the field and stores the carried
+// result in out1.
+//
+// All inputs are read before out1 is written, so out1 may refer to the
+// same storage as arg1 and/or arg2.
+fe_carry_mul :: proc (out1: ^Tight_Field_Element, arg1, arg2: ^Loose_Field_Element) {
+	// 25 partial products, each as a (hi, lo) pair.  When the limb indices
+	// sum to 5 or more the term wraps around the modulus, so one operand
+	// is pre-multiplied by 0x13 (19), as 2^255 = 19 mod 2^255-19.
+	x2, x1 := bits.mul_u64(arg1[4], (arg2[4] * 0x13))
+	x4, x3 := bits.mul_u64(arg1[4], (arg2[3] * 0x13))
+	x6, x5 := bits.mul_u64(arg1[4], (arg2[2] * 0x13))
+	x8, x7 := bits.mul_u64(arg1[4], (arg2[1] * 0x13))
+	x10, x9 := bits.mul_u64(arg1[3], (arg2[4] * 0x13))
+	x12, x11 := bits.mul_u64(arg1[3], (arg2[3] * 0x13))
+	x14, x13 := bits.mul_u64(arg1[3], (arg2[2] * 0x13))
+	x16, x15 := bits.mul_u64(arg1[2], (arg2[4] * 0x13))
+	x18, x17 := bits.mul_u64(arg1[2], (arg2[3] * 0x13))
+	x20, x19 := bits.mul_u64(arg1[1], (arg2[4] * 0x13))
+	x22, x21 := bits.mul_u64(arg1[4], arg2[0])
+	x24, x23 := bits.mul_u64(arg1[3], arg2[1])
+	x26, x25 := bits.mul_u64(arg1[3], arg2[0])
+	x28, x27 := bits.mul_u64(arg1[2], arg2[2])
+	x30, x29 := bits.mul_u64(arg1[2], arg2[1])
+	x32, x31 := bits.mul_u64(arg1[2], arg2[0])
+	x34, x33 := bits.mul_u64(arg1[1], arg2[3])
+	x36, x35 := bits.mul_u64(arg1[1], arg2[2])
+	x38, x37 := bits.mul_u64(arg1[1], arg2[1])
+	x40, x39 := bits.mul_u64(arg1[1], arg2[0])
+	x42, x41 := bits.mul_u64(arg1[0], arg2[4])
+	x44, x43 := bits.mul_u64(arg1[0], arg2[3])
+	x46, x45 := bits.mul_u64(arg1[0], arg2[2])
+	x48, x47 := bits.mul_u64(arg1[0], arg2[1])
+	x50, x49 := bits.mul_u64(arg1[0], arg2[0])
+	// Sum the partial products of each output limb as a 128-bit value:
+	// low words first, then high words plus the carry out of the low add.
+	// Limb 0 is summed first (x63/x65) and immediately split into its low
+	// 51 bits (x68) and the carry into limb 1 (x67).
+	x51, x52 := bits.add_u64(x13, x7, u64(0x0))
+	x53, _ := bits.add_u64(x14, x8, u64(fiat.u1(x52)))
+	x55, x56 := bits.add_u64(x17, x51, u64(0x0))
+	x57, _ := bits.add_u64(x18, x53, u64(fiat.u1(x56)))
+	x59, x60 := bits.add_u64(x19, x55, u64(0x0))
+	x61, _ := bits.add_u64(x20, x57, u64(fiat.u1(x60)))
+	x63, x64 := bits.add_u64(x49, x59, u64(0x0))
+	x65, _ := bits.add_u64(x50, x61, u64(fiat.u1(x64)))
+	x67 := ((x63 >> 51) | ((x65 << 13) & 0xffffffffffffffff))
+	x68 := (x63 & 0x7ffffffffffff)
+	// Limb 4 (x81/x83).
+	x69, x70 := bits.add_u64(x23, x21, u64(0x0))
+	x71, _ := bits.add_u64(x24, x22, u64(fiat.u1(x70)))
+	x73, x74 := bits.add_u64(x27, x69, u64(0x0))
+	x75, _ := bits.add_u64(x28, x71, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x33, x73, u64(0x0))
+	x79, _ := bits.add_u64(x34, x75, u64(fiat.u1(x78)))
+	x81, x82 := bits.add_u64(x41, x77, u64(0x0))
+	x83, _ := bits.add_u64(x42, x79, u64(fiat.u1(x82)))
+	// Limb 3 (x97/x99).
+	x85, x86 := bits.add_u64(x25, x1, u64(0x0))
+	x87, _ := bits.add_u64(x26, x2, u64(fiat.u1(x86)))
+	x89, x90 := bits.add_u64(x29, x85, u64(0x0))
+	x91, _ := bits.add_u64(x30, x87, u64(fiat.u1(x90)))
+	x93, x94 := bits.add_u64(x35, x89, u64(0x0))
+	x95, _ := bits.add_u64(x36, x91, u64(fiat.u1(x94)))
+	x97, x98 := bits.add_u64(x43, x93, u64(0x0))
+	x99, _ := bits.add_u64(x44, x95, u64(fiat.u1(x98)))
+	// Limb 2 (x113/x115).
+	x101, x102 := bits.add_u64(x9, x3, u64(0x0))
+	x103, _ := bits.add_u64(x10, x4, u64(fiat.u1(x102)))
+	x105, x106 := bits.add_u64(x31, x101, u64(0x0))
+	x107, _ := bits.add_u64(x32, x103, u64(fiat.u1(x106)))
+	x109, x110 := bits.add_u64(x37, x105, u64(0x0))
+	x111, _ := bits.add_u64(x38, x107, u64(fiat.u1(x110)))
+	x113, x114 := bits.add_u64(x45, x109, u64(0x0))
+	x115, _ := bits.add_u64(x46, x111, u64(fiat.u1(x114)))
+	// Limb 1 (x129/x131).
+	x117, x118 := bits.add_u64(x11, x5, u64(0x0))
+	x119, _ := bits.add_u64(x12, x6, u64(fiat.u1(x118)))
+	x121, x122 := bits.add_u64(x15, x117, u64(0x0))
+	x123, _ := bits.add_u64(x16, x119, u64(fiat.u1(x122)))
+	x125, x126 := bits.add_u64(x39, x121, u64(0x0))
+	x127, _ := bits.add_u64(x40, x123, u64(fiat.u1(x126)))
+	x129, x130 := bits.add_u64(x47, x125, u64(0x0))
+	x131, _ := bits.add_u64(x48, x127, u64(fiat.u1(x130)))
+	// Carry chain: add the carry from the previous limb, keep the low 51
+	// bits, and pass everything above on to the next limb (1 -> 2 -> 3 -> 4).
+	x133, x134 := bits.add_u64(x67, x129, u64(0x0))
+	x135 := (u64(fiat.u1(x134)) + x131)
+	x136 := ((x133 >> 51) | ((x135 << 13) & 0xffffffffffffffff))
+	x137 := (x133 & 0x7ffffffffffff)
+	x138, x139 := bits.add_u64(x136, x113, u64(0x0))
+	x140 := (u64(fiat.u1(x139)) + x115)
+	x141 := ((x138 >> 51) | ((x140 << 13) & 0xffffffffffffffff))
+	x142 := (x138 & 0x7ffffffffffff)
+	x143, x144 := bits.add_u64(x141, x97, u64(0x0))
+	x145 := (u64(fiat.u1(x144)) + x99)
+	x146 := ((x143 >> 51) | ((x145 << 13) & 0xffffffffffffffff))
+	x147 := (x143 & 0x7ffffffffffff)
+	x148, x149 := bits.add_u64(x146, x81, u64(0x0))
+	x150 := (u64(fiat.u1(x149)) + x83)
+	x151 := ((x148 >> 51) | ((x150 << 13) & 0xffffffffffffffff))
+	x152 := (x148 & 0x7ffffffffffff)
+	// The carry out of limb 4 wraps around to limb 0 multiplied by 19,
+	// followed by two more carry steps (0 -> 1 -> 2).
+	x153 := (x151 * 0x13)
+	x154 := (x68 + x153)
+	x155 := (x154 >> 51)
+	x156 := (x154 & 0x7ffffffffffff)
+	x157 := (x155 + x137)
+	x158 := fiat.u1((x157 >> 51))
+	x159 := (x157 & 0x7ffffffffffff)
+	x160 := (u64(x158) + x142)
+	out1[0] = x156
+	out1[1] = x159
+	out1[2] = x160
+	out1[3] = x147
+	out1[4] = x152
+}
+
+// fe_carry_square squares arg1 in the field and stores the carried result
+// in out1.
+//
+// arg1 is fully read before out1 is written, so out1 may refer to the
+// same storage as arg1.
+fe_carry_square :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	// Pre-scaled limbs: *0x2 covers the symmetric cross terms, *0x13 (19)
+	// covers the terms that wrap around the modulus.
+	x1 := (arg1[4] * 0x13)
+	x2 := (x1 * 0x2)
+	x3 := (arg1[4] * 0x2)
+	x4 := (arg1[3] * 0x13)
+	x5 := (x4 * 0x2)
+	x6 := (arg1[3] * 0x2)
+	x7 := (arg1[2] * 0x2)
+	x8 := (arg1[1] * 0x2)
+	// 15 partial products, each as a (hi, lo) pair.
+	x10, x9 := bits.mul_u64(arg1[4], x1)
+	x12, x11 := bits.mul_u64(arg1[3], x2)
+	x14, x13 := bits.mul_u64(arg1[3], x4)
+	x16, x15 := bits.mul_u64(arg1[2], x2)
+	x18, x17 := bits.mul_u64(arg1[2], x5)
+	x20, x19 := bits.mul_u64(arg1[2], arg1[2])
+	x22, x21 := bits.mul_u64(arg1[1], x2)
+	x24, x23 := bits.mul_u64(arg1[1], x6)
+	x26, x25 := bits.mul_u64(arg1[1], x7)
+	x28, x27 := bits.mul_u64(arg1[1], arg1[1])
+	x30, x29 := bits.mul_u64(arg1[0], x3)
+	x32, x31 := bits.mul_u64(arg1[0], x6)
+	x34, x33 := bits.mul_u64(arg1[0], x7)
+	x36, x35 := bits.mul_u64(arg1[0], x8)
+	x38, x37 := bits.mul_u64(arg1[0], arg1[0])
+	// Sum the partial products of each output limb as a 128-bit value.
+	// Limb 0 is summed first (x43/x45) and immediately split into its low
+	// 51 bits (x48) and the carry into limb 1 (x47).
+	x39, x40 := bits.add_u64(x21, x17, u64(0x0))
+	x41, _ := bits.add_u64(x22, x18, u64(fiat.u1(x40)))
+	x43, x44 := bits.add_u64(x37, x39, u64(0x0))
+	x45, _ := bits.add_u64(x38, x41, u64(fiat.u1(x44)))
+	x47 := ((x43 >> 51) | ((x45 << 13) & 0xffffffffffffffff))
+	x48 := (x43 & 0x7ffffffffffff)
+	// Limb 4 (x53/x55).
+	x49, x50 := bits.add_u64(x23, x19, u64(0x0))
+	x51, _ := bits.add_u64(x24, x20, u64(fiat.u1(x50)))
+	x53, x54 := bits.add_u64(x29, x49, u64(0x0))
+	x55, _ := bits.add_u64(x30, x51, u64(fiat.u1(x54)))
+	// Limb 3 (x61/x63).
+	x57, x58 := bits.add_u64(x25, x9, u64(0x0))
+	x59, _ := bits.add_u64(x26, x10, u64(fiat.u1(x58)))
+	x61, x62 := bits.add_u64(x31, x57, u64(0x0))
+	x63, _ := bits.add_u64(x32, x59, u64(fiat.u1(x62)))
+	// Limb 2 (x69/x71).
+	x65, x66 := bits.add_u64(x27, x11, u64(0x0))
+	x67, _ := bits.add_u64(x28, x12, u64(fiat.u1(x66)))
+	x69, x70 := bits.add_u64(x33, x65, u64(0x0))
+	x71, _ := bits.add_u64(x34, x67, u64(fiat.u1(x70)))
+	// Limb 1 (x77/x79).
+	x73, x74 := bits.add_u64(x15, x13, u64(0x0))
+	x75, _ := bits.add_u64(x16, x14, u64(fiat.u1(x74)))
+	x77, x78 := bits.add_u64(x35, x73, u64(0x0))
+	x79, _ := bits.add_u64(x36, x75, u64(fiat.u1(x78)))
+	// Carry chain: add the carry from the previous limb, keep the low 51
+	// bits, and pass everything above on to the next limb (1 -> 2 -> 3 -> 4).
+	x81, x82 := bits.add_u64(x47, x77, u64(0x0))
+	x83 := (u64(fiat.u1(x82)) + x79)
+	x84 := ((x81 >> 51) | ((x83 << 13) & 0xffffffffffffffff))
+	x85 := (x81 & 0x7ffffffffffff)
+	x86, x87 := bits.add_u64(x84, x69, u64(0x0))
+	x88 := (u64(fiat.u1(x87)) + x71)
+	x89 := ((x86 >> 51) | ((x88 << 13) & 0xffffffffffffffff))
+	x90 := (x86 & 0x7ffffffffffff)
+	x91, x92 := bits.add_u64(x89, x61, u64(0x0))
+	x93 := (u64(fiat.u1(x92)) + x63)
+	x94 := ((x91 >> 51) | ((x93 << 13) & 0xffffffffffffffff))
+	x95 := (x91 & 0x7ffffffffffff)
+	x96, x97 := bits.add_u64(x94, x53, u64(0x0))
+	x98 := (u64(fiat.u1(x97)) + x55)
+	x99 := ((x96 >> 51) | ((x98 << 13) & 0xffffffffffffffff))
+	x100 := (x96 & 0x7ffffffffffff)
+	// The carry out of limb 4 wraps around to limb 0 multiplied by 19,
+	// followed by two more carry steps (0 -> 1 -> 2).
+	x101 := (x99 * 0x13)
+	x102 := (x48 + x101)
+	x103 := (x102 >> 51)
+	x104 := (x102 & 0x7ffffffffffff)
+	x105 := (x103 + x85)
+	x106 := fiat.u1((x105 >> 51))
+	x107 := (x105 & 0x7ffffffffffff)
+	x108 := (u64(x106) + x90)
+	out1[0] = x104
+	out1[1] = x107
+	out1[2] = x108
+	out1[3] = x95
+	out1[4] = x100
+}
+
+// fe_carry propagates the excess above bit 51 of each limb of arg1 into
+// the next limb, and stores the result in out1.
+//
+// arg1 is fully read before out1 is written, so out1 may refer to the
+// same storage as arg1.
+fe_carry :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	// First pass: ripple the carries upwards, limb 0 -> 4.
+	x1 := arg1[0]
+	x2 := ((x1 >> 51) + arg1[1])
+	x3 := ((x2 >> 51) + arg1[2])
+	x4 := ((x3 >> 51) + arg1[3])
+	x5 := ((x4 >> 51) + arg1[4])
+	// The carry out of limb 4 wraps around to limb 0 multiplied by 19,
+	// followed by two more carry steps (0 -> 1 -> 2).
+	x6 := ((x1 & 0x7ffffffffffff) + ((x5 >> 51) * 0x13))
+	x7 := (u64(fiat.u1((x6 >> 51))) + (x2 & 0x7ffffffffffff))
+	x8 := (x6 & 0x7ffffffffffff)
+	x9 := (x7 & 0x7ffffffffffff)
+	x10 := (u64(fiat.u1((x7 >> 51))) + (x3 & 0x7ffffffffffff))
+	x11 := (x4 & 0x7ffffffffffff)
+	x12 := (x5 & 0x7ffffffffffff)
+	out1[0] = x8
+	out1[1] = x9
+	out1[2] = x10
+	out1[3] = x11
+	out1[4] = x12
+}
+
+// fe_add sets out1 = arg1 + arg2, limb by limb, without carrying.
+//
+// Each output limb only depends on the input limbs at the same index, so
+// out1 may refer to the same storage as either argument.
+fe_add :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	for i in 0..<5 {
+		out1[i] = arg1[i] + arg2[i]
+	}
+}
+
+// fe_sub sets out1 = arg1 - arg2, limb by limb, without carrying.
+//
+// A fixed per-limb bias is added before subtracting arg2.
+//
+// Each output limb only depends on the input limbs at the same index, so
+// out1 may refer to the same storage as either argument.
+fe_sub :: proc "contextless" (out1: ^Loose_Field_Element, arg1, arg2: ^Tight_Field_Element) {
+	bias := [5]u64{
+		0xfffffffffffda,
+		0xffffffffffffe,
+		0xffffffffffffe,
+		0xffffffffffffe,
+		0xffffffffffffe,
+	}
+	for i in 0..<5 {
+		out1[i] = (bias[i] + arg1[i]) - arg2[i]
+	}
+}
+
+// fe_opp sets out1 = -arg1, limb by limb, without carrying.
+//
+// Each limb is subtracted from a fixed per-limb constant.
+//
+// Each output limb only depends on the input limb at the same index, so
+// out1 may refer to the same storage as arg1.
+fe_opp :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	bias := [5]u64{
+		0xfffffffffffda,
+		0xffffffffffffe,
+		0xffffffffffffe,
+		0xffffffffffffe,
+		0xffffffffffffe,
+	}
+	for i in 0..<5 {
+		out1[i] = bias[i] - arg1[i]
+	}
+}
+
+// fe_cond_assign overwrites out1 with arg1 when arg2 is 1, and leaves out1
+// untouched when arg2 is 0.  The selection is done with fiat.cmovznz_u64
+// rather than a branch.
+fe_cond_assign :: proc "contextless" (out1, arg1: ^Tight_Field_Element, arg2: int) {
+	flag := fiat.u1(arg2)
+	for i in 0..<5 {
+		out1[i] = fiat.cmovznz_u64(flag, out1[i], arg1[i])
+	}
+}
+
+// fe_to_bytes serializes arg1 into out1 as 32 little-endian bytes.
+//
+// The value is first brought into canonical form by subtracting the
+// modulus (limbs 0x7ffffffffffed, 0x7ffffffffffff x 4) and adding it back
+// if the subtraction borrowed, using a mask rather than a branch.
+fe_to_bytes :: proc "contextless" (out1: ^[32]byte, arg1: ^Tight_Field_Element) {
+	// Subtract the modulus, rippling the borrow through all five limbs.
+	x1, x2 := _subborrowx_u51(0x0, arg1[0], 0x7ffffffffffed)
+	x3, x4 := _subborrowx_u51(x2, arg1[1], 0x7ffffffffffff)
+	x5, x6 := _subborrowx_u51(x4, arg1[2], 0x7ffffffffffff)
+	x7, x8 := _subborrowx_u51(x6, arg1[3], 0x7ffffffffffff)
+	x9, x10 := _subborrowx_u51(x8, arg1[4], 0x7ffffffffffff)
+	// x11 is all-ones if the final subtraction borrowed, zero otherwise;
+	// it gates adding the modulus back in.
+	x11 := fiat.cmovznz_u64(x10, u64(0x0), 0xffffffffffffffff)
+	x12, x13 := _addcarryx_u51(0x0, x1, (x11 & 0x7ffffffffffed))
+	x14, x15 := _addcarryx_u51(x13, x3, (x11 & 0x7ffffffffffff))
+	x16, x17 := _addcarryx_u51(x15, x5, (x11 & 0x7ffffffffffff))
+	x18, x19 := _addcarryx_u51(x17, x7, (x11 & 0x7ffffffffffff))
+	x20, _ := _addcarryx_u51(x19, x9, (x11 & 0x7ffffffffffff))
+	// Limbs 1..4 start at bits 51, 102, 153 and 204 of the output, i.e.
+	// 3, 6, 1 and 4 bits into a byte, so pre-shift them accordingly.
+	x22 := (x20 << 4)
+	x23 := (x18 * u64(0x2))
+	x24 := (x16 << 6)
+	x25 := (x14 << 3)
+	// Peel off one byte at a time.  The bits left over at the top of each
+	// limb are merged into the low bits of the next (pre-shifted) limb.
+	x26 := (u8(x12) & 0xff)
+	x27 := (x12 >> 8)
+	x28 := (u8(x27) & 0xff)
+	x29 := (x27 >> 8)
+	x30 := (u8(x29) & 0xff)
+	x31 := (x29 >> 8)
+	x32 := (u8(x31) & 0xff)
+	x33 := (x31 >> 8)
+	x34 := (u8(x33) & 0xff)
+	x35 := (x33 >> 8)
+	x36 := (u8(x35) & 0xff)
+	x37 := u8((x35 >> 8))
+	x38 := (x25 + u64(x37))
+	x39 := (u8(x38) & 0xff)
+	x40 := (x38 >> 8)
+	x41 := (u8(x40) & 0xff)
+	x42 := (x40 >> 8)
+	x43 := (u8(x42) & 0xff)
+	x44 := (x42 >> 8)
+	x45 := (u8(x44) & 0xff)
+	x46 := (x44 >> 8)
+	x47 := (u8(x46) & 0xff)
+	x48 := (x46 >> 8)
+	x49 := (u8(x48) & 0xff)
+	x50 := u8((x48 >> 8))
+	x51 := (x24 + u64(x50))
+	x52 := (u8(x51) & 0xff)
+	x53 := (x51 >> 8)
+	x54 := (u8(x53) & 0xff)
+	x55 := (x53 >> 8)
+	x56 := (u8(x55) & 0xff)
+	x57 := (x55 >> 8)
+	x58 := (u8(x57) & 0xff)
+	x59 := (x57 >> 8)
+	x60 := (u8(x59) & 0xff)
+	x61 := (x59 >> 8)
+	x62 := (u8(x61) & 0xff)
+	x63 := (x61 >> 8)
+	x64 := (u8(x63) & 0xff)
+	x65 := fiat.u1((x63 >> 8))
+	x66 := (x23 + u64(x65))
+	x67 := (u8(x66) & 0xff)
+	x68 := (x66 >> 8)
+	x69 := (u8(x68) & 0xff)
+	x70 := (x68 >> 8)
+	x71 := (u8(x70) & 0xff)
+	x72 := (x70 >> 8)
+	x73 := (u8(x72) & 0xff)
+	x74 := (x72 >> 8)
+	x75 := (u8(x74) & 0xff)
+	x76 := (x74 >> 8)
+	x77 := (u8(x76) & 0xff)
+	x78 := u8((x76 >> 8))
+	x79 := (x22 + u64(x78))
+	x80 := (u8(x79) & 0xff)
+	x81 := (x79 >> 8)
+	x82 := (u8(x81) & 0xff)
+	x83 := (x81 >> 8)
+	x84 := (u8(x83) & 0xff)
+	x85 := (x83 >> 8)
+	x86 := (u8(x85) & 0xff)
+	x87 := (x85 >> 8)
+	x88 := (u8(x87) & 0xff)
+	x89 := (x87 >> 8)
+	x90 := (u8(x89) & 0xff)
+	x91 := u8((x89 >> 8))
+	// Least significant byte first.
+	out1[0] = x26
+	out1[1] = x28
+	out1[2] = x30
+	out1[3] = x32
+	out1[4] = x34
+	out1[5] = x36
+	out1[6] = x39
+	out1[7] = x41
+	out1[8] = x43
+	out1[9] = x45
+	out1[10] = x47
+	out1[11] = x49
+	out1[12] = x52
+	out1[13] = x54
+	out1[14] = x56
+	out1[15] = x58
+	out1[16] = x60
+	out1[17] = x62
+	out1[18] = x64
+	out1[19] = x67
+	out1[20] = x69
+	out1[21] = x71
+	out1[22] = x73
+	out1[23] = x75
+	out1[24] = x77
+	out1[25] = x80
+	out1[26] = x82
+	out1[27] = x84
+	out1[28] = x86
+	out1[29] = x88
+	out1[30] = x90
+	out1[31] = x91
+}
+
+// _fe_from_bytes unpacks 32 little-endian bytes into five 51-bit limbs.
+//
+// The top limb is not masked, so the most significant bit of arg1[31]
+// ends up in bit 51 of out1[4] if set.  fe_from_bytes is the wrapper that
+// clears that bit beforehand.
+_fe_from_bytes :: proc "contextless" (out1: ^Tight_Field_Element, arg1: ^[32]byte) {
+	// Shift each byte to its position within the limb that it (mostly)
+	// belongs to.  Bytes 6, 12, 19 and 25 straddle a limb boundary; their
+	// upper bits are carried into the next limb further down.
+	x1 := (u64(arg1[31]) << 44)
+	x2 := (u64(arg1[30]) << 36)
+	x3 := (u64(arg1[29]) << 28)
+	x4 := (u64(arg1[28]) << 20)
+	x5 := (u64(arg1[27]) << 12)
+	x6 := (u64(arg1[26]) << 4)
+	x7 := (u64(arg1[25]) << 47)
+	x8 := (u64(arg1[24]) << 39)
+	x9 := (u64(arg1[23]) << 31)
+	x10 := (u64(arg1[22]) << 23)
+	x11 := (u64(arg1[21]) << 15)
+	x12 := (u64(arg1[20]) << 7)
+	x13 := (u64(arg1[19]) << 50)
+	x14 := (u64(arg1[18]) << 42)
+	x15 := (u64(arg1[17]) << 34)
+	x16 := (u64(arg1[16]) << 26)
+	x17 := (u64(arg1[15]) << 18)
+	x18 := (u64(arg1[14]) << 10)
+	x19 := (u64(arg1[13]) << 2)
+	x20 := (u64(arg1[12]) << 45)
+	x21 := (u64(arg1[11]) << 37)
+	x22 := (u64(arg1[10]) << 29)
+	x23 := (u64(arg1[9]) << 21)
+	x24 := (u64(arg1[8]) << 13)
+	x25 := (u64(arg1[7]) << 5)
+	x26 := (u64(arg1[6]) << 48)
+	x27 := (u64(arg1[5]) << 40)
+	x28 := (u64(arg1[4]) << 32)
+	x29 := (u64(arg1[3]) << 24)
+	x30 := (u64(arg1[2]) << 16)
+	x31 := (u64(arg1[1]) << 8)
+	x32 := arg1[0]
+	// Limb 0: bytes 0..6, with the bits above bit 51 carried into limb 1.
+	x33 := (x31 + u64(x32))
+	x34 := (x30 + x33)
+	x35 := (x29 + x34)
+	x36 := (x28 + x35)
+	x37 := (x27 + x36)
+	x38 := (x26 + x37)
+	x39 := (x38 & 0x7ffffffffffff)
+	x40 := u8((x38 >> 51))
+	// Limb 1: bytes 7..12 plus the carry from limb 0.
+	x41 := (x25 + u64(x40))
+	x42 := (x24 + x41)
+	x43 := (x23 + x42)
+	x44 := (x22 + x43)
+	x45 := (x21 + x44)
+	x46 := (x20 + x45)
+	x47 := (x46 & 0x7ffffffffffff)
+	x48 := u8((x46 >> 51))
+	// Limb 2: bytes 13..19 plus the carry from limb 1.
+	x49 := (x19 + u64(x48))
+	x50 := (x18 + x49)
+	x51 := (x17 + x50)
+	x52 := (x16 + x51)
+	x53 := (x15 + x52)
+	x54 := (x14 + x53)
+	x55 := (x13 + x54)
+	x56 := (x55 & 0x7ffffffffffff)
+	x57 := u8((x55 >> 51))
+	// Limb 3: bytes 20..25 plus the carry from limb 2.
+	x58 := (x12 + u64(x57))
+	x59 := (x11 + x58)
+	x60 := (x10 + x59)
+	x61 := (x9 + x60)
+	x62 := (x8 + x61)
+	x63 := (x7 + x62)
+	x64 := (x63 & 0x7ffffffffffff)
+	x65 := u8((x63 >> 51))
+	// Limb 4: bytes 26..31 plus the carry from limb 3 (not masked).
+	x66 := (x6 + u64(x65))
+	x67 := (x5 + x66)
+	x68 := (x4 + x67)
+	x69 := (x3 + x68)
+	x70 := (x2 + x69)
+	x71 := (x1 + x70)
+	out1[0] = x39
+	out1[1] = x47
+	out1[2] = x56
+	out1[3] = x64
+	out1[4] = x71
+}
+
+// fe_relax copies the limbs of the tight element arg1 into the loose
+// element out1, unchanged.
+fe_relax :: proc "contextless" (out1: ^Loose_Field_Element, arg1: ^Tight_Field_Element) {
+	for i in 0..<5 {
+		out1[i] = arg1[i]
+	}
+}
+
+// fe_carry_scmul_121666 multiplies arg1 by the constant 121666 (0x1db42)
+// in the field and stores the carried result in out1.
+//
+// arg1 is fully read before out1 is written, so out1 may refer to the
+// same storage as arg1.
+fe_carry_scmul_121666 :: proc (out1: ^Tight_Field_Element, arg1: ^Loose_Field_Element) {
+	// One (hi, lo) product per limb.
+	x2, x1 := bits.mul_u64(0x1db42, arg1[4])
+	x4, x3 := bits.mul_u64(0x1db42, arg1[3])
+	x6, x5 := bits.mul_u64(0x1db42, arg1[2])
+	x8, x7 := bits.mul_u64(0x1db42, arg1[1])
+	x10, x9 := bits.mul_u64(0x1db42, arg1[0])
+	// Carry chain: keep the low 51 bits of each limb and pass everything
+	// above on to the next limb (0 -> 1 -> 2 -> 3 -> 4).
+	x11 := ((x9 >> 51) | ((x10 << 13) & 0xffffffffffffffff))
+	x12 := (x9 & 0x7ffffffffffff)
+	x13, x14 := bits.add_u64(x11, x7, u64(0x0))
+	x15 := (u64(fiat.u1(x14)) + x8)
+	x16 := ((x13 >> 51) | ((x15 << 13) & 0xffffffffffffffff))
+	x17 := (x13 & 0x7ffffffffffff)
+	x18, x19 := bits.add_u64(x16, x5, u64(0x0))
+	x20 := (u64(fiat.u1(x19)) + x6)
+	x21 := ((x18 >> 51) | ((x20 << 13) & 0xffffffffffffffff))
+	x22 := (x18 & 0x7ffffffffffff)
+	x23, x24 := bits.add_u64(x21, x3, u64(0x0))
+	x25 := (u64(fiat.u1(x24)) + x4)
+	x26 := ((x23 >> 51) | ((x25 << 13) & 0xffffffffffffffff))
+	x27 := (x23 & 0x7ffffffffffff)
+	x28, x29 := bits.add_u64(x26, x1, u64(0x0))
+	x30 := (u64(fiat.u1(x29)) + x2)
+	x31 := ((x28 >> 51) | ((x30 << 13) & 0xffffffffffffffff))
+	x32 := (x28 & 0x7ffffffffffff)
+	// The carry out of limb 4 wraps around to limb 0 multiplied by 19,
+	// followed by two more carry steps (0 -> 1 -> 2).
+	x33 := (x31 * 0x13)
+	x34 := (x12 + x33)
+	x35 := fiat.u1((x34 >> 51))
+	x36 := (x34 & 0x7ffffffffffff)
+	x37 := (u64(x35) + x17)
+	x38 := fiat.u1((x37 >> 51))
+	x39 := (x37 & 0x7ffffffffffff)
+	x40 := (u64(x38) + x22)
+	out1[0] = x36
+	out1[1] = x39
+	out1[2] = x40
+	out1[3] = x27
+	out1[4] = x32
+}
+
+// The following routines were added by hand, and do not come from fiat-crypto.
+
+// fe_zero sets out1 to the field element 0.
+fe_zero :: proc "contextless" (out1: ^Tight_Field_Element) {
+	for i in 0..<5 {
+		out1[i] = 0
+	}
+}
+
+// fe_one sets out1 to the field element 1.
+fe_one :: proc "contextless" (out1: ^Tight_Field_Element) {
+	// Only the least significant limb is non-zero.
+	out1[0] = 1
+	for i in 1..<5 {
+		out1[i] = 0
+	}
+}
+
+// fe_set copies arg1 into out1.
+fe_set :: proc "contextless" (out1, arg1: ^Tight_Field_Element) {
+	for i in 0..<5 {
+		out1[i] = arg1[i]
+	}
+}
+
+// fe_cond_swap exchanges out1 and out2 when arg1 is 1, and leaves both
+// untouched when arg1 is 0, using masking rather than a branch.
+fe_cond_swap :: proc "contextless" (out1, out2: ^Tight_Field_Element, arg1: int) {
+	// All-ones when arg1 == 1, all-zeroes when arg1 == 0.
+	mask := -u64(arg1)
+	for i in 0..<5 {
+		// delta is either 0 or the xor difference of the two limbs, so
+		// xoring it into both sides is either a no-op or a swap.
+		delta := (out1[i] ~ out2[i]) & mask
+		lhs, rhs := out1[i] ~ delta, out2[i] ~ delta
+		out1[i], out2[i] = lhs, rhs
+	}
+}
@@ -0,0 +1,126 @@
+package x25519
+
+import field "core:crypto/_fiat/field_curve25519"
+import "core:mem"
+
+// SCALAR_SIZE is the required length of a scalar in bytes.
+SCALAR_SIZE :: 32
+// POINT_SIZE is the required length of a point (and of the destination
+// buffer) in bytes.
+POINT_SIZE :: 32
+
+// _BASE_POINT is the encoded point used by scalarmult_basepoint: the value
+// 9 as a 32-byte little-endian integer.
+_BASE_POINT: [32]byte = {9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+
+// _scalar_bit returns bit i of the little-endian scalar s (bit 0 being the
+// least significant bit of s[0]), or 0 when i is negative.
+_scalar_bit :: #force_inline proc "contextless" (s: ^[32]byte, i: int) -> u8 {
+	if i >= 0 {
+		byte_idx, bit_idx := i >> 3, uint(i & 7)
+		return (s[byte_idx] >> bit_idx) & 1
+	}
+	return 0
+}
+
+// _scalarmult writes the X25519 product of `scalar` and `point` to `out`.
+//
+// Bits 254..0 of `scalar` are used exactly as given; clamping is done by
+// the caller (see scalarmult).  All intermediate field elements are wiped
+// before returning.
+_scalarmult :: proc (out, scalar, point: ^[32]byte) {
+	// Montgomery pseudo-multiplication taken from Monocypher.
+
+	// x1 is the deserialized input point (most significant bit ignored)
+	x1: field.Tight_Field_Element = ---
+	field.fe_from_bytes(&x1, point)
+
+	// computes the actual scalar product (the result is in x2 and z2)
+	x2, x3, z2, z3: field.Tight_Field_Element =  ---, ---, ---, ---
+	t0, t1: field.Loose_Field_Element = ---, ---
+
+	// Montgomery ladder
+	// In projective coordinates, to avoid divisions: x = X / Z
+	// We don't care about the y coordinate, it's only 1 bit of information
+	field.fe_one(&x2) // "zero" point
+	field.fe_zero(&z2)
+	field.fe_set(&x3, &x1) // "one" point
+	field.fe_one(&z3)
+
+	swap: int
+	for pos := 255-1; pos >= 0; pos = pos - 1 	{
+		// constant time conditional swap before ladder step
+		b := int(_scalar_bit(scalar, pos))
+		swap ~= b // xor trick avoids swapping at the end of the loop
+		field.fe_cond_swap(&x2, &x3, swap)
+		field.fe_cond_swap(&z2, &z3, swap)
+		swap = b // anticipates one last swap after the loop
+
+		// Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
+		// with differential addition
+		//
+		// Note: This deliberately omits reductions after add/sub operations
+		// if the result is only ever used as the input to a mul/square since
+		// the implementations of those can deal with non-reduced inputs.
+		//
+		// fe_tighten_cast is only used to store a fully reduced
+		// output in a Loose_Field_Element, or to provide such a
+		// Loose_Field_Element as a Tight_Field_Element argument.
+		//
+		// The statements below reuse x2/z2/x3/z3/t0/t1 as both inputs and
+		// outputs, so their order must not be changed.
+		field.fe_sub(&t0, &x3, &z3)
+		field.fe_sub(&t1, &x2, &z2)
+		field.fe_add(field.fe_relax_cast(&x2), &x2, &z2) // x2 - unreduced
+		field.fe_add(field.fe_relax_cast(&z2), &x3, &z3) // z2 - unreduced
+		field.fe_carry_mul(&z3, &t0, field.fe_relax_cast(&x2))
+		field.fe_carry_mul(&z2, field.fe_relax_cast(&z2), &t1) // z2 - reduced
+		field.fe_carry_square(field.fe_tighten_cast(&t0), &t1) // t0 - reduced
+		field.fe_carry_square(field.fe_tighten_cast(&t1), field.fe_relax_cast(&x2)) // t1 - reduced
+		field.fe_add(field.fe_relax_cast(&x3), &z3, &z2) // x3 - unreduced
+		field.fe_sub(field.fe_relax_cast(&z2), &z3, &z2) // z2 - unreduced
+		field.fe_carry_mul(&x2, &t1, &t0) // x2 - reduced
+		field.fe_sub(&t1, field.fe_tighten_cast(&t1), field.fe_tighten_cast(&t0)) // safe - t1/t0 is reduced
+		field.fe_carry_square(&z2, field.fe_relax_cast(&z2)) // z2 - reduced
+		field.fe_carry_scmul_121666(&z3, &t1)
+		field.fe_carry_square(&x3, field.fe_relax_cast(&x3)) // x3 - reduced
+		field.fe_add(&t0, field.fe_tighten_cast(&t0), &z3) // safe - t0 is reduced
+		field.fe_carry_mul(&z3, field.fe_relax_cast(&x1), field.fe_relax_cast(&z2))
+		field.fe_carry_mul(&z2, &t1, &t0)
+	}
+	// last swap is necessary to compensate for the xor trick
+	// Note: after this swap, P3 == P2 + P1.
+	field.fe_cond_swap(&x2, &x3, swap)
+	field.fe_cond_swap(&z2, &z3, swap)
+
+	// normalises the coordinates: x == X / Z
+	field.fe_carry_inv(&z2, field.fe_relax_cast(&z2))
+	field.fe_carry_mul(&x2, field.fe_relax_cast(&x2), field.fe_relax_cast(&z2))
+	field.fe_to_bytes(out, &x2)
+
+	mem.zero_explicit(&x1, size_of(x1))
+	mem.zero_explicit(&x2, size_of(x2))
+	mem.zero_explicit(&x3, size_of(x3))
+	mem.zero_explicit(&z2, size_of(z2))
+	mem.zero_explicit(&z3, size_of(z3))
+	mem.zero_explicit(&t0, size_of(t0))
+	mem.zero_explicit(&t1, size_of(t1))
+}
+
+// scalarmult writes the X25519 product of `scalar` and `point` to `dst`.
+//
+// Inputs:
+// - dst:    destination, must be POINT_SIZE bytes
+// - scalar: must be SCALAR_SIZE bytes; a clamped copy is used, the
+//           caller's slice is not modified
+// - point:  must be POINT_SIZE bytes
+//
+// Panics if any of the slices has the wrong length.
+scalarmult :: proc (dst, scalar, point: []byte) {
+	switch {
+	case len(scalar) != SCALAR_SIZE:
+		panic("crypto/x25519: invalid scalar size")
+	case len(point) != POINT_SIZE:
+		panic("crypto/x25519: invalid point size")
+	case len(dst) != POINT_SIZE:
+		panic("crypto/x25519: invalid destination point size")
+	}
+
+	// "clamp" a private copy of the scalar: clear the low 3 bits, clear
+	// bit 255 and set bit 254.
+	clamped: [32]byte = ---
+	defer mem.zero_explicit(&clamped, size_of(clamped))
+	copy_slice(clamped[:], scalar)
+	clamped[0] &= 248
+	clamped[31] = (clamped[31] & 127) | 64
+
+	pt: [32]byte = ---
+	copy_slice(pt[:], point)
+
+	// Compute into a local buffer, and only then copy out to dst.
+	product: [32]byte = ---
+	defer mem.zero_explicit(&product, size_of(product))
+	_scalarmult(&product, &clamped, &pt)
+	copy_slice(dst, product[:])
+}
+
+// scalarmult_basepoint writes the X25519 product of `scalar` and the
+// package's base point (_BASE_POINT) to `dst`.
+//
+// It is a thin wrapper around scalarmult, and panics under the same
+// conditions (wrongly sized `dst` or `scalar`).
+scalarmult_basepoint :: proc (dst, scalar: []byte) {
+	// TODO/perf: Switch to using a precomputed table.
+	scalarmult(dst, scalar, _BASE_POINT[:])
+}
@@ -115,6 +115,11 @@ main :: proc() {
    test_haval_224(&t)
    test_haval_256(&t)

+    // "modern" crypto tests
+    test_x25519(&t)
+
+    bench_modern(&t)
+
    fmt.printf("%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
 }

@@ -0,0 +1,95 @@
+package test_core_crypto
+
+import "core:testing"
+import "core:fmt"
+import "core:time"
+
+import "core:crypto/x25519"
+
+// _digit_value maps an ASCII digit or letter to its numeric value
+// ('0'..'9' -> 0..9, 'a'..'z' and 'A'..'Z' -> 10..35).  Any other rune
+// yields 16.
+_digit_value :: proc(r: rune) -> int {
+	ri := int(r)
+	switch r {
+	case '0'..='9':
+		return ri-'0'
+	case 'a'..='z':
+		return ri-'a'+10
+	case 'A'..='Z':
+		return ri-'A'+10
+	}
+	return 16
+}
+
+// _decode_hex32 decodes a hex string into a 32-byte array, two characters
+// per byte, high nibble first.  Bytes past the end of the input are left
+// as zero.
+//
+// No validation is done: the input is expected to be an even number of
+// hex digits, and at most 64 characters long.
+_decode_hex32 :: proc(s: string) -> [32]byte{
+	decoded: [32]byte
+	for i := 0; i < len(s); i += 2 {
+		hi, lo := _digit_value(rune(s[i])), _digit_value(rune(s[i+1]))
+		decoded[i/2] = byte((hi << 4) | lo)
+	}
+	return decoded
+}
+
+// TestECDH is a single X25519 test vector, with all values hex encoded.
+TestECDH :: struct {
+	scalar:  string, // input scalar
+	point:   string, // input point
+	product: string, // expected output of scalarmult(scalar, point)
+}
+
+// test_x25519 checks x25519.scalarmult against known answers, and checks
+// that x25519.scalarmult_basepoint agrees with x25519.scalarmult when the
+// latter is explicitly given the base point.
+@(test)
+test_x25519 :: proc(t: ^testing.T) {
+	log(t, "Testing X25519")
+
+	// Test vectors from RFC 7748
+	vectors := [?]TestECDH {
+		TestECDH{
+			scalar  = "a546e36bf0527c9d3b16154b82465edd62144c0ac1fc5a18506a2244ba449ac4",
+			point   = "e6db6867583030db3594c1a424b15f7c726624ec26b3353b10a903a6d0ab1c4c",
+			product = "c3da55379de9c6908e94ea4df28d084f32eccf03491c71f754b4075577a28552",
+		},
+		TestECDH{
+			scalar  = "4b66e9d4d1b4673c5ad22691957d6af5c11b6421e0ea01d42ca4169e7918ba0d",
+			point   = "e5210f12786811d3f4b7959d0538ae2c31dbe7106fc03c3efc4cd549c715a493",
+			product = "95cbde9476e8907d7aade45cb4b873f88b595a68799fa152e6f8f7647aac7957",
+		},
+	}
+	for v in vectors {
+		scalar := _decode_hex32(v.scalar)
+		point := _decode_hex32(v.point)
+
+		// Known answer test.
+		derived: [x25519.POINT_SIZE]byte
+		x25519.scalarmult(derived[:], scalar[:], point[:])
+		derived_hex := hex_string(derived[:])
+
+		expect(t, derived_hex == v.product, fmt.tprintf("Expected %s for %s * %s, but got %s instead", v.product, v.scalar, v.point, derived_hex))
+
+		// Abuse the test vectors to sanity-check the scalar-basepoint multiply.
+		via_helper, via_generic: [x25519.POINT_SIZE]byte
+		x25519.scalarmult_basepoint(via_helper[:], scalar[:])
+		x25519.scalarmult(via_generic[:], scalar[:], x25519._BASE_POINT[:])
+		helper_hex := hex_string(via_helper[:])
+		generic_hex := hex_string(via_generic[:])
+		expect(t, helper_hex == generic_hex, fmt.tprintf("Expected %s for %s * basepoint, but got %s instead", generic_hex, v.scalar, helper_hex))
+	}
+
+	// TODO/tests: Run the wycheproof test vectors, once I figure out
+	// how to work with JSON.
+}
+
+// bench_modern prints a banner and runs the benchmarks for the "modern"
+// crypto primitives (currently only X25519).
+@(test)
+bench_modern :: proc(t: ^testing.T) {
+	fmt.println("Starting benchmarks:")
+
+	bench_x25519(t)
+}
+
+// bench_x25519 times a fixed number of x25519.scalarmult calls on fixed
+// inputs and logs the average time per call, in microseconds.
+bench_x25519 :: proc(t: ^testing.T) {
+	ITERATIONS :: 10000
+
+	point := _decode_hex32("deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef")
+	scalar := _decode_hex32("cafebabecafebabecafebabecafebabecafebabecafebabecafebabecafebabe")
+	out: [x25519.POINT_SIZE]byte = ---
+
+	began := time.now()
+	for _ in 0..<ITERATIONS {
+		x25519.scalarmult(out[:], scalar[:], point[:])
+	}
+	took := time.since(began)
+
+	us_per_op := time.duration_microseconds(took) / ITERATIONS
+	log(t, fmt.tprintf("x25519.scalarmult: ~%f us/op", us_per_op))
+}