update third_party/martins_hash with latest code

This commit is contained in:
Mārtiņš Možeiko
2025-10-19 19:09:33 -07:00
committed by Ryan Fleury
parent 4bc6240ac0
commit 6589cbe374
4 changed files with 842 additions and 250 deletions
+4 -4
View File
@@ -46,9 +46,9 @@ static inline void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]);
#endif
#if defined(_MSC_VER)
# define MD5_GET32LE(ptr) *((const _UNALIGNED uint32_t*)(ptr))
# define MD5_SET32LE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = (x)
# define MD5_SET64LE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = (x)
# define MD5_GET32LE(ptr) *((const __unaligned uint32_t*)(ptr))
# define MD5_SET32LE(ptr,x) *((__unaligned uint32_t*)(ptr)) = (x)
# define MD5_SET64LE(ptr,x) *((__unaligned uint64_t*)(ptr)) = (x)
#else
# define MD5_GET32LE(ptr) \
( \
@@ -431,5 +431,5 @@ void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE])
}
#if defined(__clang__)
#pragma clang diagnostic pop
# pragma clang diagnostic pop
#endif
+232 -17
View File
@@ -50,9 +50,9 @@ static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]);
#if defined(_MSC_VER)
# include <stdlib.h>
# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) )
# define SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
# define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) )
# define SHA1_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
# define SHA1_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
#else
# define SHA1_GET32BE(ptr) \
( \
@@ -137,36 +137,86 @@ static inline int sha1_cpuid(void)
SHA1_TARGET("ssse3,sha")
static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
{
const __m128i* buffer = (const __m128i*)block;
// in SHA1 each round has two parts:
// 1) calculate message schedule dwords in w[i]
// 2) do round functions to update a/b/c/d/e state values using w[i]
// for performing two operations in one:
// 1) dwords need to be loaded as big-endian
// 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0]
const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
// w[i] in first 16 rounds is just loaded from block bytes, as 32-bit big-endian load
// for next rounds it is done as:
// w[i] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
// where ROL(x) = 32-bit rotate left by 1
// this means it is possible to keep just the last 16 of w's in circular buffer
// and every new w calculated will need to update 1 to 3 previous w's
// unrolling round calculations by 4 we get:
// w[i+0] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
// w[i+1] = ROL(w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15])
// w[i+2] = ROL(w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14])
// w[i+3] = ROL(w[i+0] ^ w[i-5] ^ w[i-11] ^ w[i-13])
// now if you store 4 w[..] values in 128-bit SSE register, then
// W(i) = ROL( r0 ^ r1 ^ r2 ^ r3 )
// with caveat that r0 lane 3 depends on W(i) lane 0
// [3] [2] [1] [0] // lanes
// r0 = [ special, w[i-1], w[i-2], w[i-3] ]
// r1 = [ w[i-5], w[i-6], w[i-7], w[i-8] ]
// r2 = [ w[i-11], w[i-12], w[i-13], w[i-14] ]
// r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
// in each 4-round i'th step it is possible to incrementally update new W(..) value when
// keeping W(i) values in 4 xmm element circular buffer
// rounds i>0: W(i-1) = r2 ^ r3 = _mm_sha1msg1_epu32(W(i-1), W(i))
// rounds i>1: W(i-2) = W(i-2) ^ r1 = _mm_xor_si128 (W(i-2), W(i))
// rounds i>2: W(i-3) = ROL(W(i-3) ^ r0) = _mm_sha1msg2_epu32(W(i-3), W(i))
// then the new W(i) can be used in round function calculations
// _mm_sha1msg2_epu32 correctly handles r0 lane 3 dependency on W(i) lane 0
// to perform round functions on two SIMD registers with state as:
// abcd = [a,b,c,d]
// e0 = [e,0,0,0]
// use the following code to get next abcd/e0 state 4 rounds at a time:
// tmp = _mm_sha1nexte_epu32(e0, W(i)) // rotates e0 and adds message dwords
// abcd_next = _mm_sha1rnds4_epu32(abcd, tmp, Fn) // with Fn = 0..3 round function selection
// e0_next = abcd
// sha1nexte is not needed on first round, just regular add32(e0, W(i)) should be used
// after last round need to do extra rotation, which sha1nexte takes care when adding to last_e0
#define W(i) w[(i)%4]
// 4 wide round calculations
#define QROUND(i) do { \
/* first four rounds loads input message */ \
/* first 4 rounds load input block */ \
if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \
/* update previous message dwords for next rounds */ \
/* update message schedule */ \
if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i)); \
if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \
if (i > 1 && i < 18) W(i-2) = _mm_xor_si128 (W(i-2), W(i)); \
if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i)); \
/* calculate E from message dwords */ \
if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \
/* calculate E plus message schedule */ \
if (i == 0) tmp = _mm_add_epi32 (e0, W(i)); \
if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i)); \
/* round function */ \
/* 4 round functions */ \
e0 = abcd; \
abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4); \
abcd = _mm_sha1rnds4_epu32(abcd, tmp, i/5); \
} while(0)
const __m128i* buffer = (const __m128i*)block;
// for performing two operations in one:
// 1) dwords need to be loaded as big-endian
// 2) order of dwords need to be reversed for sha1 instructions: [0,1,2,3] -> [3,2,1,0]
const __m128i bswap = _mm_setr_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);
// load initial state
__m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a]
__m128i e0 = _mm_loadu_si32(&state[4]); // [0,0,0,e]
// change dword order
// flip dword order, to what sha1 instructions use
abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane
e0 = _mm_slli_si128(e0, 12); // [e,0,0,0] where e is in top lane
@@ -183,16 +233,19 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou
QROUND(2);
QROUND(3);
QROUND(4);
QROUND(5);
QROUND(6);
QROUND(7);
QROUND(8);
QROUND(9);
QROUND(10);
QROUND(11);
QROUND(12);
QROUND(13);
QROUND(14);
QROUND(15);
QROUND(16);
QROUND(17);
@@ -221,6 +274,159 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou
#endif // defined(__x86_64__) || defined(_M_AMD64)
#if defined(__aarch64__) || defined(_M_ARM64)
#if defined(__clang__)
# define SHA1_TARGET __attribute__((target("sha2")))
#elif defined(__GNUC__)
# define SHA1_TARGET __attribute__((target("+sha2")))
#elif defined(_MSC_VER)
# define SHA1_TARGET
#endif
#include <arm_neon.h>
#if defined(_WIN32)
# include <windows.h>
#elif defined(__linux__)
# include <sys/auxv.h>
# include <asm/hwcap.h>
#elif defined(__APPLE__)
# include <sys/sysctl.h>
#endif
#define SHA1_CPUID_INIT (1 << 0)
#define SHA1_CPUID_ARM64 (1 << 1)
static inline int sha1_cpuid(void)
{
#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
int result = SHA1_CPUID_ARM64;
#else
static int cpuid;
int result = cpuid;
if (result == 0)
{
#if defined(_WIN32)
int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
#elif defined(__linux__)
unsigned long hwcap = getauxval(AT_HWCAP);
int has_arm64 = hwcap & HWCAP_SHA1;
#elif defined(__APPLE__)
int value = 0;
size_t valuelen = sizeof(value);
int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA1", &value, &valuelen, NULL, 0) == 0 && value != 0;
#else
#error unknown platform
#endif
result |= SHA1_CPUID_INIT;
if (has_arm64)
{
result |= SHA1_CPUID_ARM64;
}
cpuid = result;
}
#endif
#if defined(SHA1_CPUID_MASK)
result &= SHA1_CPUID_MASK;
#endif
return result;
}
SHA1_TARGET
static void sha1_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
{
// code here is similar to x64 shani implementation
// message array is 16 element circular buffer
// each iteration updates 4 rounds at the same time
#define W(i) w[(i)%4]
#define QROUND(i,F,k) do { \
/* update message schedule */ \
if (i >= 4) W(i) = vsha1su0q_u32(W(i), W(i-3), W(i-2)); \
if (i >= 4) W(i) = vsha1su1q_u32(W(i), W(i-1)); \
/* add round constant */ \
uint32x4_t tmp = vaddq_u32(W(i), k); \
/* 4 round functions */ \
uint32_t x = e0; \
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \
abcd = F(abcd, x, tmp); \
} while (0)
const uint32x4_t k0 = vdupq_n_u32(0x5a827999);
const uint32x4_t k1 = vdupq_n_u32(0x6ed9eba1);
const uint32x4_t k2 = vdupq_n_u32(0x8f1bbcdc);
const uint32x4_t k3 = vdupq_n_u32(0xca62c1d6);
// load state - a,b,c,d,e
uint32x4_t abcd = vld1q_u32(state);
uint32_t e0 = state[4];
do
{
// remember current state
uint32x4_t last_abcd = abcd;
uint32_t last_e0 = e0;
// load 64-byte block and advance pointer to next block
uint8x16x4_t msg = vld1q_u8_x4(block);
block += SHA1_BLOCK_SIZE;
uint32x4_t w[4];
// for first 16 w's reverse the byte order in each 32-bit lane
W(0) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[0]));
W(1) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[1]));
W(2) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[2]));
W(3) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[3]));
QROUND( 0, vsha1cq_u32, k0);
QROUND( 1, vsha1cq_u32, k0);
QROUND( 2, vsha1cq_u32, k0);
QROUND( 3, vsha1cq_u32, k0);
QROUND( 4, vsha1cq_u32, k0);
QROUND( 5, vsha1pq_u32, k1);
QROUND( 6, vsha1pq_u32, k1);
QROUND( 7, vsha1pq_u32, k1);
QROUND( 8, vsha1pq_u32, k1);
QROUND( 9, vsha1pq_u32, k1);
QROUND(10, vsha1mq_u32, k2);
QROUND(11, vsha1mq_u32, k2);
QROUND(12, vsha1mq_u32, k2);
QROUND(13, vsha1mq_u32, k2);
QROUND(14, vsha1mq_u32, k2);
QROUND(15, vsha1pq_u32, k3);
QROUND(16, vsha1pq_u32, k3);
QROUND(17, vsha1pq_u32, k3);
QROUND(18, vsha1pq_u32, k3);
QROUND(19, vsha1pq_u32, k3);
// update next state
abcd = vaddq_u32(abcd, last_abcd);
e0 += last_e0;
}
while (--count);
// save state
vst1q_u32(state, abcd);
state[4] = e0;
#undef QROUND
#undef W
}
#endif // defined(__aarch64__) || defined(_M_ARM64)
static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
{
#if defined(__x86_64__) || defined(_M_AMD64)
@@ -232,12 +438,21 @@ static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
int cpuid = sha1_cpuid();
if (cpuid & SHA1_CPUID_ARM64)
{
sha1_process_arm64(state, block, count);
return;
}
#endif
#define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z))
#define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z))
#define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y))))
#define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z))
#define W(i) w[(i+16)%16]
#define W(i) w[(i)%16]
#define ROUND(i,a,b,c,d,e,F) do \
{ \
+288 -108
View File
@@ -58,9 +58,9 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S
#if defined(_MSC_VER)
# include <stdlib.h>
# define SHA256_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) )
# define SHA256_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
# define SHA256_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
# define SHA256_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) )
# define SHA256_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
# define SHA256_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
#else
# define SHA256_GET32BE(ptr) \
( \
@@ -91,6 +91,26 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S
while (0)
#endif
static const uint32_t SHA256_K[64] =
{
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
#if defined(__x86_64__) || defined(_M_AMD64)
#include <tmmintrin.h> // SSSE3
@@ -145,47 +165,64 @@ static inline int sha256_cpuid(void)
SHA256_TARGET("ssse3,sha")
static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t count)
{
const __m128i* buffer = (const __m128i*)block;
// similar way how sha1 works in with shani
// to byteswap when doing big-ending load for message dwords
const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
// first 16 rounds loads message schedule dwords as 32-bit big endian values
static const uint32_t K[16][4] =
{
{ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 },
{ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 },
{ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 },
{ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 },
{ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc },
{ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da },
{ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 },
{ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 },
{ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 },
{ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 },
{ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 },
{ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 },
{ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 },
{ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 },
{ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 },
{ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 },
};
// for next rounds message schedule is prepared as:
// w[i] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16]
// unrolled by 4:
// w[i+0] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16]
// w[i+1] = SSig1(w[i-1]) + w[i-6] + SSig0(w[i-14]) + w[i-15]
// w[i+2] = SSig1(w[i+0]) + w[i-5] + SSig0(w[i-13]) + w[i-14]
// w[i+3] = SSig1(w[i+1]) + w[i-4] + SSig0(w[i-12]) + w[i-13]
// there is tricky dependency for lanes 2 and 3 on result of lanes 0 and 1, but sha256msg2 op takes care of that
// by storing W[i] word in 128-bit simd register, the message schedule becomes:
// W(i) = SSig1(r0) + r1 + SSig0(r2) + r3
// where + is 32-bit lane addition
// [3] [2] [1] [0] // lanes
// r0 = [ special, special, w[i-1], w[i-2] ]
// r1 = [ w[i-4], w[i-5], w[i-6], w[i-7] ]
// r2 = [ w[i-12], w[i-13], w[i-14], w[i-15] ]
// r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
// rN's can be calculated from previous W(..) values:
// r0 from W(i)
// r1 from _mm_alignr_epi8(W(i), W(i-1), 4)
// r2 from W(i-1) and W(i)
// r3 from W(i-1)
// rounds i>2: W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32( W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4) ), W(i))
// rounds i>0: W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i))
// round functions are done with _mm_sha256rnds2_epu32 which performs it for 2 rounds
// thus repeat it two times, as input use W(i) + K(i) - message schedule added with sha256 constants
#define W(i) w[(i)%4]
// 4 wide round calculations
#define QROUND(i) do { \
/* first four rounds loads input message */ \
/* first 4 rounds load input block */ \
if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \
/* add round constant */ \
tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)K[i])); \
/* update previous message dwords for next rounds */ \
/* update message schedule */ \
if (i > 2 && i < 15) W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32(W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4)), W(i)); \
if (i > 0 && i < 13) W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i)); \
/* round functions */ \
/* add round constants */ \
__m128i tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)&SHA256_K[4*i])); \
/* 4 round functions */ \
state1 = _mm_sha256rnds2_epu32(state1, state0, tmp); \
state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(0,0,3,2))); \
} while(0)
const __m128i* buffer = (const __m128i*)block;
// to byteswap when doing big-ending load for message dwords
const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
// load initial state
__m128i abcd = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d]
__m128i efgh = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h]
@@ -200,18 +237,18 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c
__m128i last0 = state0;
__m128i last1 = state1;
__m128i tmp, w[4];
__m128i w[4];
QROUND(0);
QROUND(1);
QROUND(2);
QROUND(3);
QROUND(4);
QROUND(5);
QROUND(6);
QROUND(7);
QROUND(8);
QROUND(9);
QROUND( 0);
QROUND( 1);
QROUND( 2);
QROUND( 3);
QROUND( 4);
QROUND( 5);
QROUND( 6);
QROUND( 7);
QROUND( 8);
QROUND( 9);
QROUND(10);
QROUND(11);
QROUND(12);
@@ -241,6 +278,140 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c
#endif // defined(__x86_64__) || defined(_M_AMD64)
#if defined(__aarch64__) || defined(_M_ARM64)
#if defined(__clang__)
# define SHA256_TARGET __attribute__((target("sha2")))
#elif defined(__GNUC__)
# define SHA256_TARGET __attribute__((target("+sha2")))
#elif defined(_MSC_VER)
# define SHA256_TARGET
#endif
#include <arm_neon.h>
#if defined(_WIN32)
# include <windows.h>
#elif defined(__linux__)
# include <sys/auxv.h>
# include <asm/hwcap.h>
#elif defined(__APPLE__)
# include <sys/sysctl.h>
#endif
#define SHA256_CPUID_INIT (1 << 0)
#define SHA256_CPUID_ARM64 (1 << 1)
static inline int sha256_cpuid(void)
{
#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
int result = SHA256_CPUID_ARM64;
#else
static int cpuid;
int result = cpuid;
if (result == 0)
{
#if defined(_WIN32)
int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
#elif defined(__linux__)
unsigned long hwcap = getauxval(AT_HWCAP);
int has_arm64 = hwcap & HWCAP_SHA2;
#elif defined(__APPLE__)
int value = 0;
size_t valuelen = sizeof(value);
int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA256", &value, &valuelen, NULL, 0) == 0 && value != 0;
#else
#error unknown platform
#endif
result |= SHA256_CPUID_INIT;
if (has_arm64)
{
result |= SHA256_CPUID_ARM64;
}
cpuid = result;
}
#endif
#if defined(SHA256_CPUID_MASK)
result &= SHA256_CPUID_MASK;
#endif
return result;
}
SHA256_TARGET
static void sha256_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
{
// code here is similar to x64 shani implementation
#define W(i) w[(i)%4]
#define QROUND(i) do { \
/* load 16 round constants */ \
if ((i % 4) == 0) rk = vld1q_u32_x4(&SHA256_K[4*i]); \
/* first 4 rounds reverse byte order in each 32-bit lane of input block */ \
if (i < 4) W(i) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[i])); \
/* update message schedule */ \
if (i >= 4) W(i) = vsha256su0q_u32(W(i), W(i-3)); \
if (i >= 4) W(i) = vsha256su1q_u32(W(i), W(i-2), W(i-1)); \
/* add round constants */ \
uint32x4_t tmp = vaddq_u32(W(i), rk.val[i%4]); \
/* 4 round functions */ \
uint32x4_t x = vstate.val[0]; \
vstate.val[0] = vsha256hq_u32(vstate.val[0], vstate.val[1], tmp); \
vstate.val[1] = vsha256h2q_u32(vstate.val[1], x, tmp); \
} while (0)
// load initial state
uint32x4x2_t vstate = vld1q_u32_x2(state);
do
{
// remember current state
uint32x4x2_t vlast = vstate;
// load 64-byte block
uint8x16x4_t msg = vld1q_u8_x4(block);
uint32x4x4_t rk;
uint32x4_t w[4];
QROUND( 0);
QROUND( 1);
QROUND( 2);
QROUND( 3);
QROUND( 4);
QROUND( 5);
QROUND( 6);
QROUND( 7);
QROUND( 8);
QROUND( 9);
QROUND(10);
QROUND(11);
QROUND(12);
QROUND(13);
QROUND(14);
QROUND(15);
// update next state
vstate.val[0] = vaddq_u32(vstate.val[0], vlast.val[0]);
vstate.val[1] = vaddq_u32(vstate.val[1], vlast.val[1]);
block += SHA256_BLOCK_SIZE;
}
while (--count);
// save the new state
vst1q_u32_x2(state, vstate);
#undef QROUND
#undef W
}
#endif // defined(__aarch64__) || defined(_M_ARM64)
static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
{
#if defined(__x86_64__) || defined(_M_AMD64)
@@ -252,6 +423,15 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
int cpuid = sha256_cpuid();
if (cpuid & SHA256_CPUID_ARM64)
{
sha256_process_arm64(state, block, count);
return;
}
#endif
#define Ch(x,y,z) ((x & (y ^ z)) ^ z)
#define Maj(x,y,z) ((x & y) | (z & (x | y)))
@@ -262,13 +442,13 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
#define W(i) w[(i+16)%16]
#define ROUND(i,a,b,c,d,e,f,g,h,K) do \
#define ROUND(i,a,b,c,d,e,f,g,h) do \
{ \
uint32_t w0; \
if (i < 16) W(i) = w0 = SHA256_GET32BE(block + i*sizeof(uint32_t)); \
if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \
\
uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \
uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA256_K[i] + w0; \
uint32_t t2 = BSig0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2; \
@@ -287,70 +467,70 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
uint32_t w[16];
ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98);
ROUND( 1, h, a, b, c, d, e, f, g, 0x71374491);
ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25b);
ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1);
ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4);
ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98);
ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b01);
ROUND(10, g, h, a, b, c, d, e, f, 0x243185be);
ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74);
ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174);
ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786);
ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da);
ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152);
ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d);
ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c8);
ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147);
ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351);
ROUND(31, b, c, d, e, f, g, h, a, 0x14292967);
ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a85);
ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
ROUND(35, f, g, h, a, b, c, d, e, 0x53380d13);
ROUND(36, e, f, g, h, a, b, c, d, 0x650a7354);
ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb);
ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
ROUND(39, b, c, d, e, f, g, h, a, 0x92722c85);
ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664b);
ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819);
ROUND(45, d, e, f, g, h, a, b, c, 0xd6990624);
ROUND(46, c, d, e, f, g, h, a, b, 0xf40e3585);
ROUND(47, b, c, d, e, f, g, h, a, 0x106aa070);
ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116);
ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c08);
ROUND(50, g, h, a, b, c, d, e, f, 0x2748774c);
ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee);
ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f);
ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814);
ROUND(59, f, g, h, a, b, c, d, e, 0x8cc70208);
ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa);
ROUND(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2);
ROUND( 0, a, b, c, d, e, f, g, h);
ROUND( 1, h, a, b, c, d, e, f, g);
ROUND( 2, g, h, a, b, c, d, e, f);
ROUND( 3, f, g, h, a, b, c, d, e);
ROUND( 4, e, f, g, h, a, b, c, d);
ROUND( 5, d, e, f, g, h, a, b, c);
ROUND( 6, c, d, e, f, g, h, a, b);
ROUND( 7, b, c, d, e, f, g, h, a);
ROUND( 8, a, b, c, d, e, f, g, h);
ROUND( 9, h, a, b, c, d, e, f, g);
ROUND(10, g, h, a, b, c, d, e, f);
ROUND(11, f, g, h, a, b, c, d, e);
ROUND(12, e, f, g, h, a, b, c, d);
ROUND(13, d, e, f, g, h, a, b, c);
ROUND(14, c, d, e, f, g, h, a, b);
ROUND(15, b, c, d, e, f, g, h, a);
ROUND(16, a, b, c, d, e, f, g, h);
ROUND(17, h, a, b, c, d, e, f, g);
ROUND(18, g, h, a, b, c, d, e, f);
ROUND(19, f, g, h, a, b, c, d, e);
ROUND(20, e, f, g, h, a, b, c, d);
ROUND(21, d, e, f, g, h, a, b, c);
ROUND(22, c, d, e, f, g, h, a, b);
ROUND(23, b, c, d, e, f, g, h, a);
ROUND(24, a, b, c, d, e, f, g, h);
ROUND(25, h, a, b, c, d, e, f, g);
ROUND(26, g, h, a, b, c, d, e, f);
ROUND(27, f, g, h, a, b, c, d, e);
ROUND(28, e, f, g, h, a, b, c, d);
ROUND(29, d, e, f, g, h, a, b, c);
ROUND(30, c, d, e, f, g, h, a, b);
ROUND(31, b, c, d, e, f, g, h, a);
ROUND(32, a, b, c, d, e, f, g, h);
ROUND(33, h, a, b, c, d, e, f, g);
ROUND(34, g, h, a, b, c, d, e, f);
ROUND(35, f, g, h, a, b, c, d, e);
ROUND(36, e, f, g, h, a, b, c, d);
ROUND(37, d, e, f, g, h, a, b, c);
ROUND(38, c, d, e, f, g, h, a, b);
ROUND(39, b, c, d, e, f, g, h, a);
ROUND(40, a, b, c, d, e, f, g, h);
ROUND(41, h, a, b, c, d, e, f, g);
ROUND(42, g, h, a, b, c, d, e, f);
ROUND(43, f, g, h, a, b, c, d, e);
ROUND(44, e, f, g, h, a, b, c, d);
ROUND(45, d, e, f, g, h, a, b, c);
ROUND(46, c, d, e, f, g, h, a, b);
ROUND(47, b, c, d, e, f, g, h, a);
ROUND(48, a, b, c, d, e, f, g, h);
ROUND(49, h, a, b, c, d, e, f, g);
ROUND(50, g, h, a, b, c, d, e, f);
ROUND(51, f, g, h, a, b, c, d, e);
ROUND(52, e, f, g, h, a, b, c, d);
ROUND(53, d, e, f, g, h, a, b, c);
ROUND(54, c, d, e, f, g, h, a, b);
ROUND(55, b, c, d, e, f, g, h, a);
ROUND(56, a, b, c, d, e, f, g, h);
ROUND(57, h, a, b, c, d, e, f, g);
ROUND(58, g, h, a, b, c, d, e, f);
ROUND(59, f, g, h, a, b, c, d, e);
ROUND(60, e, f, g, h, a, b, c, d);
ROUND(61, d, e, f, g, h, a, b, c);
ROUND(62, c, d, e, f, g, h, a, b);
ROUND(63, b, c, d, e, f, g, h, a);
state[0] += a;
state[1] += b;
+318 -121
View File
@@ -58,8 +58,8 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S
#if defined(_MSC_VER)
# include <stdlib.h>
# define SHA512_GET64BE(ptr) _byteswap_uint64( *((const _UNALIGNED uint64_t*)(ptr)) )
# define SHA512_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
# define SHA512_GET64BE(ptr) _byteswap_uint64( *((const __unaligned uint64_t*)(ptr)) )
# define SHA512_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
#else
# define SHA512_GET64BE(ptr) \
( \
@@ -86,9 +86,33 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S
while (0)
#endif
static const uint64_t SHA512_K[80] =
{
0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
};
#if defined(__x86_64__) || defined(_M_AMD64)
#include <immintrin.h>
#include <immintrin.h> // AVX2 + SHA512
#if defined(__clang__) || defined(__GNUC__)
# include <cpuid.h>
@@ -117,7 +141,7 @@ static inline int sha512_cpuid(void)
{
int info[4];
SHA256_CPUID(1, info);
SHA512_CPUID(1, info);
int has_xsave = info[2] & (1 << 26);
int has_ymm = 0;
@@ -127,13 +151,13 @@ static inline int sha512_cpuid(void)
has_ymm = xcr0 & (1 << 2);
}
SHA256_CPUID_EX(7, 0, info);
SHA512_CPUID_EX(7, 0, info);
int has_avx2 = info[1] & (1 << 5);
SHA256_CPUID_EX(7, 1, info);
SHA512_CPUID_EX(7, 1, info);
int has_sha512 = info[0] & (1 << 0);
result |= SHA256_CPUID_INIT;
result |= SHA512_CPUID_INIT;
if (has_ymm && has_avx2 && has_sha512)
{
result |= SHA512_CPUID_VSHA512;
@@ -152,51 +176,32 @@ static inline int sha512_cpuid(void)
SHA512_TARGET("avx2,sha512")
static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t count)
{
const __m256i* buffer = (const __m256i*)block;
// pretty much same way how sha256 works, only with avx2 registers and 64-bit additions
// state is kept as two 256-bit ymm registers (8 qwords)
// to byteswap when doing big-ending load for message qwords
const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8));
static const uint64_t K[20][4] =
{
{ 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc },
{ 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 },
{ 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 },
{ 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 },
{ 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 },
{ 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 },
{ 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 },
{ 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 },
{ 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df },
{ 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b },
{ 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 },
{ 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 },
{ 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 },
{ 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 },
{ 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec },
{ 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b },
{ 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 },
{ 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b },
{ 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c },
{ 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 },
};
// message qwords are loaded as 64-bit big-endian values
#define W(i) w[(i)%4]
// 4 wide round calculations
#define QROUND(i) do { \
/* first four rounds loads input message */ \
/* first 4 rounds load input block */ \
if (i < 4) W(i) = _mm256_shuffle_epi8(_mm256_loadu_si256(&buffer[i]), bswap); \
/* add round constant */ \
tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)K[i])); \
/* update previous message qwords for next rounds */ \
/* update message schedule */ \
if (i > 2 && i < 19) W(i-3) = _mm256_sha512msg2_epi64(_mm256_add_epi64(W(i-3), _mm256_permute4x64_epi64(_mm256_blend_epi32(W(i-1), W(i), 3), _MM_SHUFFLE(0,3,2,1))), W(i)); \
if (i > 0 && i < 17) W(i-1) = _mm256_sha512msg1_epi64(W(i-1), _mm256_castsi256_si128(W(i))); \
/* add round constants */ \
__m256i tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)&SHA512_K[4*i])); \
/* round functions */ \
state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_castsi256_si128(tmp)); \
state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(tmp, 1)); \
} while(0)
const __m256i* buffer = (const __m256i*)block;
// to byteswap when doing big-ending load for message qwords
const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8));
// load initial state
__m256i abcd = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d]
__m256i efgh = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h]
@@ -211,7 +216,7 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t
__m256i last0 = state0;
__m256i last1 = state1;
__m256i tmp, w[4];
__m256i w[4];
QROUND(0);
QROUND(1);
@@ -256,6 +261,189 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t
#endif // defined(__x86_64__) || defined(_M_AMD64)
#if defined(__aarch64__) || defined(_M_ARM64)
#if defined(__clang__)
# define SHA512_TARGET __attribute__((target("sha3")))
#elif defined(__GNUC__)
# define SHA512_TARGET __attribute__((target("+sha3")))
#elif defined(_MSC_VER)
# define SHA512_TARGET
#endif
#include <arm_neon.h>
#if defined(_WIN32)
# include <windows.h>
# pragma comment (lib, "advapi32")
#elif defined(__linux__)
# include <sys/auxv.h>
# include <asm/hwcap.h>
#elif defined(__APPLE__)
# include <sys/sysctl.h>
#endif
#define SHA512_CPUID_INIT (1 << 0)
#define SHA512_CPUID_ARM64 (1 << 1)
#if defined(_WIN32)
#endif
static inline int sha512_cpuid(void)
{
#if defined(__ARM_FEATURE_SHA512)
int result = SHA512_CPUID_ARM64;
#else
static int cpuid;
int result = cpuid;
if (result == 0)
{
#if defined(_WIN32)
// no sha512 bit in IsProcessorFeaturePresent function :(
uint64_t bits;
DWORD bitsize = sizeof(bits);
RegGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "CP 4030", RRF_RT_QWORD | RRF_ZEROONFAILURE, NULL, &bits, &bitsize);
// bits from ID_AA64ISAR0_EL1
int has_arm64 = ((bits >> 15) & 0xf) == 0x2;
#elif defined(__linux__)
unsigned long hwcap = getauxval(AT_HWCAP);
int has_arm64 = hwcap & HWCAP_SHA512;
#elif defined(__APPLE__)
int value = 0;
size_t valuelen = sizeof(value);
int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA512", &value, &valuelen, NULL, 0) == 0 && value != 0;
#else
#error unknown platform
#endif
result |= SHA512_CPUID_INIT;
if (has_arm64)
{
result |= SHA512_CPUID_ARM64;
}
cpuid = result;
}
#endif
#if defined(SHA512_CPUID_MASK)
result &= SHA512_CPUID_MASK;
#endif
return result;
}
SHA512_TARGET
static void sha512_process_arm64(uint64_t* state, const uint8_t* block, size_t count)
{
#define W(i) w[(i)%8]
#define S(i) vstate.val[3-(i)%4]
#define DROUND(i) do { \
/* load 8 round constants */ \
if ((i % 4) == 0) rk = vld1q_u64_x4(&SHA512_K[2*i]); \
/* first 8 rounds reverse byte order in each 64-bit lane of input block */ \
if (i < 8) W(i) = vreinterpretq_u64_u8(vrev64q_u8(msg[(i/4)%2].val[i%4])); \
/* update message schedule for next rounds */ \
if (i >= 8) W(i) = vsha512su1q_u64(vsha512su0q_u64(W(i), W(i-7)), W(i-1), vextq_u64(W(i-4), W(i-3), 1)); \
/* add round constants */ \
uint64x2_t tmp = vaddq_u64(W(i), rk.val[i%4]); \
/* 2 round functions */ \
uint64x2_t x0 = vaddq_u64(vextq_u64(tmp, tmp, 1), S(i+0)); \
uint64x2_t x1 = vsha512hq_u64(x0, vextq_u64(S(i+1), S(i+0), 1), vextq_u64(S(i+2), S(i+1), 1)); \
S(i+0) = vsha512h2q_u64(x1, S(i+2), S(i+3)); \
S(i+2) = vaddq_u64(S(i+2), x1); \
} while (0)
// load initial state
uint64x2x4_t vstate = vld1q_u64_x4(state);
do
{
// remember current state
uint64x2x4_t vlast = vstate;
// load 128-byte block
uint8x16x4_t msg[2] =
{
vld1q_u8_x4(block + 0 * 16),
vld1q_u8_x4(block + 4 * 16),
};
uint64x2x4_t rk;
uint64x2_t w[8];
DROUND( 0);
DROUND( 1);
DROUND( 2);
DROUND( 3);
DROUND( 4);
DROUND( 5);
DROUND( 6);
DROUND( 7);
DROUND( 8);
DROUND( 9);
DROUND(10);
DROUND(11);
DROUND(12);
DROUND(13);
DROUND(14);
DROUND(15);
DROUND(16);
DROUND(17);
DROUND(18);
DROUND(19);
DROUND(20);
DROUND(21);
DROUND(22);
DROUND(23);
DROUND(24);
DROUND(25);
DROUND(26);
DROUND(27);
DROUND(28);
DROUND(29);
DROUND(30);
DROUND(31);
DROUND(32);
DROUND(33);
DROUND(34);
DROUND(35);
DROUND(36);
DROUND(37);
DROUND(38);
DROUND(39);
// update next state
vstate.val[0] = vaddq_u64(vstate.val[0], vlast.val[0]);
vstate.val[1] = vaddq_u64(vstate.val[1], vlast.val[1]);
vstate.val[2] = vaddq_u64(vstate.val[2], vlast.val[2]);
vstate.val[3] = vaddq_u64(vstate.val[3], vlast.val[3]);
block += SHA512_BLOCK_SIZE;
}
while (--count);
// save the new state
vst1q_u64_x4(state, vstate);
#undef DROUND
#undef S
#undef W
}
#endif // defined(__aarch64__) || defined(_M_ARM64)
static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
{
#if defined(__x86_64__) || defined(_M_AMD64)
@@ -267,6 +455,15 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
}
#endif
#if defined(__aarch64__) || defined(_M_ARM64)
int cpuid = sha512_cpuid();
if (cpuid & SHA512_CPUID_ARM64)
{
sha512_process_arm64(state, block, count);
return;
}
#endif
#define Ch(x,y,z) ((x & (y ^ z)) ^ z)
#define Maj(x,y,z) ((x & y) | (z & (x | y)))
@@ -277,13 +474,13 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
#define W(i) w[(i+16)%16]
#define ROUND(i,a,b,c,d,e,f,g,h,K) do \
#define ROUND(i,a,b,c,d,e,f,g,h) do \
{ \
uint64_t w0; \
if (i < 16) W(i) = w0 = SHA512_GET64BE(block + i*sizeof(uint64_t)); \
if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \
\
uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \
uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA512_K[i] + w0; \
uint64_t t2 = BSig0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2; \
@@ -302,86 +499,86 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
uint64_t w[16];
ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
ROUND( 1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
ROUND(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
ROUND(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
ROUND(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
ROUND(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
ROUND(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
ROUND(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
ROUND(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
ROUND(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
ROUND(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
ROUND(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
ROUND(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b);
ROUND(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
ROUND(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
ROUND(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
ROUND(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
ROUND(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
ROUND(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
ROUND(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
ROUND(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
ROUND(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
ROUND(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
ROUND(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
ROUND(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
ROUND(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
ROUND(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
ROUND(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
ROUND(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817);
ROUND( 0, a, b, c, d, e, f, g, h);
ROUND( 1, h, a, b, c, d, e, f, g);
ROUND( 2, g, h, a, b, c, d, e, f);
ROUND( 3, f, g, h, a, b, c, d, e);
ROUND( 4, e, f, g, h, a, b, c, d);
ROUND( 5, d, e, f, g, h, a, b, c);
ROUND( 6, c, d, e, f, g, h, a, b);
ROUND( 7, b, c, d, e, f, g, h, a);
ROUND( 8, a, b, c, d, e, f, g, h);
ROUND( 9, h, a, b, c, d, e, f, g);
ROUND(10, g, h, a, b, c, d, e, f);
ROUND(11, f, g, h, a, b, c, d, e);
ROUND(12, e, f, g, h, a, b, c, d);
ROUND(13, d, e, f, g, h, a, b, c);
ROUND(14, c, d, e, f, g, h, a, b);
ROUND(15, b, c, d, e, f, g, h, a);
ROUND(16, a, b, c, d, e, f, g, h);
ROUND(17, h, a, b, c, d, e, f, g);
ROUND(18, g, h, a, b, c, d, e, f);
ROUND(19, f, g, h, a, b, c, d, e);
ROUND(20, e, f, g, h, a, b, c, d);
ROUND(21, d, e, f, g, h, a, b, c);
ROUND(22, c, d, e, f, g, h, a, b);
ROUND(23, b, c, d, e, f, g, h, a);
ROUND(24, a, b, c, d, e, f, g, h);
ROUND(25, h, a, b, c, d, e, f, g);
ROUND(26, g, h, a, b, c, d, e, f);
ROUND(27, f, g, h, a, b, c, d, e);
ROUND(28, e, f, g, h, a, b, c, d);
ROUND(29, d, e, f, g, h, a, b, c);
ROUND(30, c, d, e, f, g, h, a, b);
ROUND(31, b, c, d, e, f, g, h, a);
ROUND(32, a, b, c, d, e, f, g, h);
ROUND(33, h, a, b, c, d, e, f, g);
ROUND(34, g, h, a, b, c, d, e, f);
ROUND(35, f, g, h, a, b, c, d, e);
ROUND(36, e, f, g, h, a, b, c, d);
ROUND(37, d, e, f, g, h, a, b, c);
ROUND(38, c, d, e, f, g, h, a, b);
ROUND(39, b, c, d, e, f, g, h, a);
ROUND(40, a, b, c, d, e, f, g, h);
ROUND(41, h, a, b, c, d, e, f, g);
ROUND(42, g, h, a, b, c, d, e, f);
ROUND(43, f, g, h, a, b, c, d, e);
ROUND(44, e, f, g, h, a, b, c, d);
ROUND(45, d, e, f, g, h, a, b, c);
ROUND(46, c, d, e, f, g, h, a, b);
ROUND(47, b, c, d, e, f, g, h, a);
ROUND(48, a, b, c, d, e, f, g, h);
ROUND(49, h, a, b, c, d, e, f, g);
ROUND(50, g, h, a, b, c, d, e, f);
ROUND(51, f, g, h, a, b, c, d, e);
ROUND(52, e, f, g, h, a, b, c, d);
ROUND(53, d, e, f, g, h, a, b, c);
ROUND(54, c, d, e, f, g, h, a, b);
ROUND(55, b, c, d, e, f, g, h, a);
ROUND(56, a, b, c, d, e, f, g, h);
ROUND(57, h, a, b, c, d, e, f, g);
ROUND(58, g, h, a, b, c, d, e, f);
ROUND(59, f, g, h, a, b, c, d, e);
ROUND(60, e, f, g, h, a, b, c, d);
ROUND(61, d, e, f, g, h, a, b, c);
ROUND(62, c, d, e, f, g, h, a, b);
ROUND(63, b, c, d, e, f, g, h, a);
ROUND(64, a, b, c, d, e, f, g, h);
ROUND(65, h, a, b, c, d, e, f, g);
ROUND(66, g, h, a, b, c, d, e, f);
ROUND(67, f, g, h, a, b, c, d, e);
ROUND(68, e, f, g, h, a, b, c, d);
ROUND(69, d, e, f, g, h, a, b, c);
ROUND(70, c, d, e, f, g, h, a, b);
ROUND(71, b, c, d, e, f, g, h, a);
ROUND(72, a, b, c, d, e, f, g, h);
ROUND(73, h, a, b, c, d, e, f, g);
ROUND(74, g, h, a, b, c, d, e, f);
ROUND(75, f, g, h, a, b, c, d, e);
ROUND(76, e, f, g, h, a, b, c, d);
ROUND(77, d, e, f, g, h, a, b, c);
ROUND(78, c, d, e, f, g, h, a, b);
ROUND(79, b, c, d, e, f, g, h, a);
state[0] += a;
state[1] += b;