mirror of
https://github.com/Ed94/Odin.git
synced 2026-06-17 03:12:22 -07:00
261 lines
6.7 KiB
Odin
261 lines
6.7 KiB
Odin
// Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
//
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS “AS IS” AND ANY EXPRESS OR
|
|
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
// ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
|
|
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
// GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
package aes_ct64
|
|
|
|
// Bitsliced AES for 64-bit general purpose (integer) registers. Each
|
|
// invocation will process up to 4 blocks at a time. This implementation
|
|
// is derived from the BearSSL ct64 code, and distributed under a 1-clause
|
|
// BSD license with permission from the original author.
|
|
//
|
|
// WARNING: "hic sunt dracones"
|
|
//
|
|
// This package also deliberately exposes enough internals to be able to
|
|
// function as a replacement for `AESENC` and `AESDEC` from AES-NI, to
|
|
// allow the implementation of non-AES primitives that use the AES round
|
|
// function such as AEGIS and Deoxys-II. This should ONLY be done when
|
|
// implementing something other than AES itself.
|
|
|
|
sub_bytes :: proc "contextless" (q: ^[8]u64) {
	// Applies the S-box in place to the bitsliced state `q`, using the
	// Boolean circuit published by Boyar and Peralta in "A new
	// combinational logic minimization technique with applications to
	// cryptology" (https://eprint.iacr.org/2009/191.pdf).
	//
	// Gate names (y*, t*, z*) are kept as-is so that every line can be
	// checked against the reference circuit.  Gates that do not depend
	// on one another are declared together; the data flow is unchanged.
	//
	// The circuit numbers its inputs x* and outputs s* from the high
	// bit down: x0/s0 is the high bit (q[7]), x7/s7 the low bit (q[0]).

	x0, x1, x2, x3 := q[7], q[6], q[5], q[4]
	x4, x5, x6, x7 := q[3], q[2], q[1], q[0]

	// Top linear transformation.
	y14, y13 := x3 ~ x5, x0 ~ x6
	y9, y8 := x0 ~ x3, x0 ~ x5
	t0 := x1 ~ x2
	y1 := t0 ~ x7
	y12 := y13 ~ y14
	y4, y2, y5 := y1 ~ x3, y1 ~ x0, y1 ~ x6
	y3 := y5 ~ y8
	t1 := x4 ~ y12
	y15, y20 := t1 ~ x5, t1 ~ x1
	y6, y10 := y15 ~ x7, y15 ~ t0
	y11 := y20 ~ y9
	y7, y17 := x7 ~ y11, y10 ~ y11
	y19 := y10 ~ y8
	y16 := t0 ~ y11
	y21, y18 := y13 ~ y16, x0 ~ y16

	// Non-linear section: AND gates first, then the XORs combining them.
	t2, t7, t12 := y12 & y15, y13 & y16, y9 & y11
	t3, t5 := y3 & y6, y4 & x7
	t8, t10 := y5 & y1, y2 & y7
	t13, t15 := y14 & y17, y8 & y10
	t4, t6 := t3 ~ t2, t5 ~ t2
	t9, t11 := t8 ~ t7, t10 ~ t7
	t14, t16 := t13 ~ t12, t15 ~ t12
	t17, t18 := t4 ~ t14, t6 ~ t16
	t19, t20 := t9 ~ t14, t11 ~ t16
	t21, t22 := t17 ~ y20, t18 ~ y19
	t23, t24 := t19 ~ y21, t20 ~ y18

	t25, t26 := t21 ~ t22, t21 & t23
	t27, t31 := t24 ~ t26, t22 ~ t26
	t30 := t23 ~ t24
	t28, t32 := t25 & t27, t31 & t30
	t29, t33 := t28 ~ t22, t32 ~ t24
	t34, t35 := t23 ~ t33, t27 ~ t33
	t36 := t24 & t35
	t37, t38 := t36 ~ t34, t27 ~ t36
	t39 := t29 & t38
	t40 := t25 ~ t39

	t41, t42 := t40 ~ t37, t29 ~ t33
	t43, t44 := t29 ~ t40, t33 ~ t37
	t45 := t42 ~ t41

	// Each of the nine t-values above is ANDed with two y-values
	// (z[i] and z[i+9] share the same t operand).
	z0, z9 := t44 & y15, t44 & y12
	z1, z10 := t37 & y6, t37 & y3
	z2, z11 := t33 & x7, t33 & y4
	z3, z12 := t43 & y16, t43 & y13
	z4, z13 := t40 & y1, t40 & y5
	z5, z14 := t29 & y7, t29 & y2
	z6, z15 := t42 & y11, t42 & y9
	z7, z16 := t45 & y17, t45 & y14
	z8, z17 := t41 & y10, t41 & y8

	// Bottom linear transformation.
	t46, t47 := z15 ~ z16, z10 ~ z11
	t48, t49 := z5 ~ z13, z9 ~ z10
	t50, t51 := z2 ~ z12, z2 ~ z5
	t52, t53 := z7 ~ z8, z0 ~ z3
	t54, t55 := z6 ~ z7, z16 ~ z17
	t56, t57 := z12 ~ t48, t50 ~ t53
	t58, t59 := z4 ~ t46, z3 ~ t54
	t60, t61 := t46 ~ t57, z14 ~ t57
	t62, t63 := t52 ~ t58, t49 ~ t58
	t64 := z4 ~ t59
	t65, t66 := t61 ~ t62, z1 ~ t63
	t67 := t64 ~ t65

	// Outputs.  `a ~ ~b` in the reference is written here as `~(a ~ b)`;
	// the two forms are bitwise identical.
	s0 := t59 ~ t63
	s3 := t53 ~ t66
	s4 := t51 ~ t66
	s5 := t47 ~ t65
	s1 := ~(t64 ~ s3)
	s2 := ~(t55 ~ t67)
	s6 := ~(t56 ~ t62)
	s7 := ~(t48 ~ t60)

	q[7], q[6], q[5], q[4] = s0, s1, s2, s3
	q[3], q[2], q[1], q[0] = s4, s5, s6, s7
}
|
|
|
|
orthogonalize :: proc "contextless" (q: ^[8]u64) {
	// Rearranges bits between the eight words of `q` in place, in three
	// passes.  Each pass pairs up words and, for every pair (a, b),
	// keeps the bits of `a` selected by the low mask and the bits of
	// `b` selected by the high mask, while exchanging the remaining
	// bits between the two words with a shift of 1, 2 or 4 positions.

	// Computes the new (a, b) pair for one exchange step.
	swap_n :: #force_inline proc "contextless" (a, b, lo_mask, hi_mask: u64, shift: uint) -> (u64, u64) {
		return (a & lo_mask) | ((b & lo_mask) << shift), ((a & hi_mask) >> shift) | (b & hi_mask)
	}

	// Pass 1: adjacent words, alternating single bits.
	CL2 :: 0x5555555555555555
	CH2 :: 0xAAAAAAAAAAAAAAAA
	q[0], q[1] = swap_n(q[0], q[1], CL2, CH2, 1)
	q[2], q[3] = swap_n(q[2], q[3], CL2, CH2, 1)
	q[4], q[5] = swap_n(q[4], q[5], CL2, CH2, 1)
	q[6], q[7] = swap_n(q[6], q[7], CL2, CH2, 1)

	// Pass 2: words two apart, alternating bit pairs.
	CL4 :: 0x3333333333333333
	CH4 :: 0xCCCCCCCCCCCCCCCC
	q[0], q[2] = swap_n(q[0], q[2], CL4, CH4, 2)
	q[1], q[3] = swap_n(q[1], q[3], CL4, CH4, 2)
	q[4], q[6] = swap_n(q[4], q[6], CL4, CH4, 2)
	q[5], q[7] = swap_n(q[5], q[7], CL4, CH4, 2)

	// Pass 3: words four apart, alternating nibbles.
	CL8 :: 0x0F0F0F0F0F0F0F0F
	CH8 :: 0xF0F0F0F0F0F0F0F0
	q[0], q[4] = swap_n(q[0], q[4], CL8, CH8, 4)
	q[1], q[5] = swap_n(q[1], q[5], CL8, CH8, 4)
	q[2], q[6] = swap_n(q[2], q[6], CL8, CH8, 4)
	q[3], q[7] = swap_n(q[3], q[7], CL8, CH8, 4)
}
|
|
|
|
@(require_results)
interleave_in :: proc "contextless" (w0, w1, w2, w3: u32) -> (q0, q1: u64) #no_bounds_check {
	// Packs four 32-bit words into two 64-bit words with their bytes
	// interleaved: q0 alternates the bytes of w0 (even byte positions)
	// and w2 (odd byte positions); q1 does the same with w1 and w3.

	// Widens `w` so that each of its four bytes occupies the low byte
	// of its own 16-bit lane, with the high byte of each lane zero.
	spread :: #force_inline proc "contextless" (w: u32) -> u64 {
		v := u64(w)
		v = (v | (v << 16)) & 0x0000FFFF0000FFFF
		v = (v | (v << 8)) & 0x00FF00FF00FF00FF
		return v
	}

	q0 = spread(w0) | (spread(w2) << 8)
	q1 = spread(w1) | (spread(w3) << 8)
	return
}
|
|
|
|
@(require_results)
interleave_out :: proc "contextless" (q0, q1: u64) -> (w0, w1, w2, w3: u32) {
	// Splits two byte-interleaved 64-bit words back into four 32-bit
	// words: the even byte positions of q0/q1 become w0/w1, the odd
	// byte positions become w2/w3.

	// Gathers the low byte of each 16-bit lane of `lanes` into one
	// 32-bit word; the high byte of each lane is discarded.
	squeeze :: #force_inline proc "contextless" (lanes: u64) -> u32 {
		v := lanes & 0x00FF00FF00FF00FF
		v = (v | (v >> 8)) & 0x0000FFFF0000FFFF
		return u32(v) | u32(v >> 16)
	}

	w0 = squeeze(q0)
	w1 = squeeze(q1)
	w2 = squeeze(q0 >> 8)
	w3 = squeeze(q1 >> 8)
	return
}
|
|
|
|
@(private)
rotr32 :: #force_inline proc "contextless" (x: u64) -> u64 {
	// Rotates the 64-bit word by 32 bits, i.e. exchanges its upper and
	// lower 32-bit halves.
	upper_to_lower := x >> 32
	lower_to_upper := x << 32
	return lower_to_upper | upper_to_lower
}
|