diff --git a/src/third_party/martins_hash/md5.h b/src/third_party/martins_hash/md5.h index 54632d0e..da477d80 100644 --- a/src/third_party/martins_hash/md5.h +++ b/src/third_party/martins_hash/md5.h @@ -46,9 +46,9 @@ static inline void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]); #endif #if defined(_MSC_VER) -# define MD5_GET32LE(ptr) *((const _UNALIGNED uint32_t*)(ptr)) -# define MD5_SET32LE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = (x) -# define MD5_SET64LE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = (x) +# define MD5_GET32LE(ptr) *((const __unaligned uint32_t*)(ptr)) +# define MD5_SET32LE(ptr,x) *((__unaligned uint32_t*)(ptr)) = (x) +# define MD5_SET64LE(ptr,x) *((__unaligned uint64_t*)(ptr)) = (x) #else # define MD5_GET32LE(ptr) \ ( \ @@ -431,5 +431,5 @@ void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]) } #if defined(__clang__) -#pragma clang diagnostic pop +# pragma clang diagnostic pop #endif diff --git a/src/third_party/martins_hash/sha1.h b/src/third_party/martins_hash/sha1.h index 043ac986..5fd59362 100644 --- a/src/third_party/martins_hash/sha1.h +++ b/src/third_party/martins_hash/sha1.h @@ -50,9 +50,9 @@ static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]); #if defined(_MSC_VER) # include -# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) ) -# define SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x) -# define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) ) +# define SHA1_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x) +# define SHA1_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x) #else # define SHA1_GET32BE(ptr) \ ( \ @@ -137,36 +137,86 @@ static inline int sha1_cpuid(void) SHA1_TARGET("ssse3,sha") static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count) { - const __m128i* buffer = (const __m128i*)block; + // in SHA1 each round has two parts: + // 1) calculate message schedule dwords in w[i] + // 2) do round functions to update a/b/c/d/e state values using w[i] - // for performing two operations in one: - // 1) dwords need to be loaded as big-endian - // 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0] - const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + // w[i] in first 16 rounds is just loaded from block bytes, as 32-bit big-endian load + + // for next rounds it is done as: + // w[i] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) + // where ROL(x) = 32-bit rotate left by 1 + + // this means it is possible to keep just the last 16 of w's in circular buffer + // and every new w calculated will need to update 1 to 3 previous w's + + // unrolling round calculations by 4 we get: + // w[i+0] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) + // w[i+1] = ROL(w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) + // w[i+2] = ROL(w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) + // w[i+3] = ROL(w[i+0] ^ w[i-5] ^ w[i-11] ^ w[i-13]) + + // now if you store 4 w[..] values in 128-bit SSE register, then + // W(i) = ROL( r0 ^ r1 ^ r2 ^ r3 ) + // with caveat that r0 lane 3 depends on W(i) lane 0 + + // [3] [2] [1] [0] // lanes + // r0 = [ special, w[i-1], w[i-2], w[i-3] ] + // r1 = [ w[i-5], w[i-6], w[i-7], w[i-8] ] + // r2 = [ w[i-11], w[i-12], w[i-13], w[i-14] ] + // r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ] + + // in each 4-round i'th step it is possible to incrementally update new W(..) value when + // keeping W(i) values in 4 xmm element circular buffer + + // rounds i>0: W(i-1) = r2 ^ r3 = _mm_sha1msg1_epu32(W(i-1), W(i)) + // rounds i>1: W(i-2) = W(i-2) ^ r1 = _mm_xor_si128 (W(i-2), W(i)) + // rounds i>2: W(i-3) = ROL(W(i-3) ^ r0) = _mm_sha1msg2_epu32(W(i-3), W(i)) + // then the new W(i) can be used in round function calculations + // _mm_sha1msg2_epu32 correctly handles r0 lane 3 dependency on W(i) lane 0 + + // to perform round functions on two SIMD registers with state as: + // abcd = [a,b,c,d] + // e0 = [e,0,0,0] + // use the following code to get next abcd/e0 state 4 rounds at a time: + + // tmp = _mm_sha1nexte_epu32(e0, W(i)) // rotates e0 and adds message dwords + // abcd_next = _mm_sha1rnds4_epu32(abcd, tmp, Fn) // with Fn = 0..3 round function selection + // e0_next = abcd + + // sha1nexte is not needed on first round, just regular add32(e0, W(i)) should be used + // after last round need to do extra rotation, which sha1nexte takes care when adding to last_e0 #define W(i) w[(i)%4] // 4 wide round calculations #define QROUND(i) do { \ - /* first four rounds loads input message */ \ + /* first 4 rounds load input block */ \ if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \ - /* update previous message dwords for next rounds */ \ + /* update message schedule */ \ if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i)); \ - if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \ + if (i > 1 && i < 18) W(i-2) = _mm_xor_si128 (W(i-2), W(i)); \ if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i)); \ - /* calculate E from message dwords */ \ - if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \ + /* calculate E plus message schedule */ \ + if (i == 0) tmp = _mm_add_epi32 (e0, W(i)); \ if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i)); \ - /* round function */ \ + /* 4 round functions */ \ e0 = abcd; \ - abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4); \ + abcd = _mm_sha1rnds4_epu32(abcd, tmp, i/5); \ } while(0) + const __m128i* buffer = (const __m128i*)block; + + // for performing two operations in one: + // 1) dwords need to be loaded as big-endian + // 2) order of dwords need to be reversed for sha1 instructions: [0,1,2,3] -> [3,2,1,0] + const __m128i bswap = _mm_setr_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0); + // load initial state __m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a] __m128i e0 = _mm_loadu_si32(&state[4]); // [0,0,0,e] - // change dword order + // flip dword order, to what sha1 instructions use abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane e0 = _mm_slli_si128(e0, 12); // [e,0,0,0] where e is in top lane @@ -183,16 +233,19 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou QROUND(2); QROUND(3); QROUND(4); + QROUND(5); QROUND(6); QROUND(7); QROUND(8); QROUND(9); + QROUND(10); QROUND(11); QROUND(12); QROUND(13); QROUND(14); + QROUND(15); QROUND(16); QROUND(17); @@ -221,6 +274,159 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou #endif // defined(__x86_64__) || defined(_M_AMD64) + +#if defined(__aarch64__) || defined(_M_ARM64) + +#if defined(__clang__) +# define SHA1_TARGET __attribute__((target("sha2"))) +#elif defined(__GNUC__) +# define SHA1_TARGET __attribute__((target("+sha2"))) +#elif defined(_MSC_VER) +# define SHA1_TARGET +#endif + +#include + +#if defined(_WIN32) +# include +#elif defined(__linux__) +# include +# include +#elif defined(__APPLE__) +# include +#endif + +#define SHA1_CPUID_INIT (1 << 0) +#define SHA1_CPUID_ARM64 (1 << 1) + +static inline int sha1_cpuid(void) +{ +#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2) + int result = SHA1_CPUID_ARM64; +#else + static int cpuid; + + int result = cpuid; + if (result == 0) + { +#if defined(_WIN32) + int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +#elif defined(__linux__) + unsigned long hwcap = getauxval(AT_HWCAP); + int has_arm64 = hwcap & HWCAP_SHA1; +#elif defined(__APPLE__) + int value = 0; + size_t valuelen = sizeof(value); + int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA1", &value, &valuelen, NULL, 0) == 0 && value != 0; +#else +#error unknown platform +#endif + result |= SHA1_CPUID_INIT; + if (has_arm64) + { + result |= SHA1_CPUID_ARM64; + } + + cpuid = result; + } +#endif + +#if defined(SHA1_CPUID_MASK) + result &= SHA1_CPUID_MASK; +#endif + + return result; +} + +SHA1_TARGET +static void sha1_process_arm64(uint32_t* state, const uint8_t* block, size_t count) +{ + // code here is similar to x64 shani implementation + + // message array is 16 element circular buffer + // each iteration updates 4 rounds at the same time + + #define W(i) w[(i)%4] + + #define QROUND(i,F,k) do { \ + /* update message schedule */ \ + if (i >= 4) W(i) = vsha1su0q_u32(W(i), W(i-3), W(i-2)); \ + if (i >= 4) W(i) = vsha1su1q_u32(W(i), W(i-1)); \ + /* add round constant */ \ + uint32x4_t tmp = vaddq_u32(W(i), k); \ + /* 4 round functions */ \ + uint32_t x = e0; \ + e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \ + abcd = F(abcd, x, tmp); \ + } while (0) + + const uint32x4_t k0 = vdupq_n_u32(0x5a827999); + const uint32x4_t k1 = vdupq_n_u32(0x6ed9eba1); + const uint32x4_t k2 = vdupq_n_u32(0x8f1bbcdc); + const uint32x4_t k3 = vdupq_n_u32(0xca62c1d6); + + // load state - a,b,c,d,e + uint32x4_t abcd = vld1q_u32(state); + uint32_t e0 = state[4]; + + do + { + // remember current state + uint32x4_t last_abcd = abcd; + uint32_t last_e0 = e0; + + // load 64-byte block and advance pointer to next block + uint8x16x4_t msg = vld1q_u8_x4(block); + block += SHA1_BLOCK_SIZE; + + uint32x4_t w[4]; + + // for first 16 w's reverse the byte order in each 32-bit lane + W(0) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[0])); + W(1) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[1])); + W(2) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[2])); + W(3) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[3])); + + QROUND( 0, vsha1cq_u32, k0); + QROUND( 1, vsha1cq_u32, k0); + QROUND( 2, vsha1cq_u32, k0); + QROUND( 3, vsha1cq_u32, k0); + QROUND( 4, vsha1cq_u32, k0); + + QROUND( 5, vsha1pq_u32, k1); + QROUND( 6, vsha1pq_u32, k1); + QROUND( 7, vsha1pq_u32, k1); + QROUND( 8, vsha1pq_u32, k1); + QROUND( 9, vsha1pq_u32, k1); + + QROUND(10, vsha1mq_u32, k2); + QROUND(11, vsha1mq_u32, k2); + QROUND(12, vsha1mq_u32, k2); + QROUND(13, vsha1mq_u32, k2); + QROUND(14, vsha1mq_u32, k2); + + QROUND(15, vsha1pq_u32, k3); + QROUND(16, vsha1pq_u32, k3); + QROUND(17, vsha1pq_u32, k3); + QROUND(18, vsha1pq_u32, k3); + QROUND(19, vsha1pq_u32, k3); + + // update next state + abcd = vaddq_u32(abcd, last_abcd); + e0 += last_e0; + } + while (--count); + + // save state + vst1q_u32(state, abcd); + state[4] = e0; + + #undef QROUND + #undef W +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) + static void sha1_process(uint32_t* state, const uint8_t* block, size_t count) { #if defined(__x86_64__) || defined(_M_AMD64) @@ -232,12 +438,21 @@ static void sha1_process(uint32_t* state, const uint8_t* block, size_t count) } #endif +#if defined(__aarch64__) || defined(_M_ARM64) + int cpuid = sha1_cpuid(); + if (cpuid & SHA1_CPUID_ARM64) + { + sha1_process_arm64(state, block, count); + return; + } +#endif + #define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z)) #define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z)) #define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y)))) #define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z)) - #define W(i) w[(i+16)%16] + #define W(i) w[(i)%16] #define ROUND(i,a,b,c,d,e,F) do \ { \ diff --git a/src/third_party/martins_hash/sha256.h b/src/third_party/martins_hash/sha256.h index 72a0fad6..b70a72a0 100644 --- a/src/third_party/martins_hash/sha256.h +++ b/src/third_party/martins_hash/sha256.h @@ -58,9 +58,9 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S #if defined(_MSC_VER) # include -# define SHA256_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) ) -# define SHA256_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x) -# define SHA256_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +# define SHA256_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) ) +# define SHA256_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x) +# define SHA256_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x) #else # define SHA256_GET32BE(ptr) \ ( \ @@ -91,6 +91,26 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S while (0) #endif +static const uint32_t SHA256_K[64] = +{ + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + #if defined(__x86_64__) || defined(_M_AMD64) #include // SSSE3 @@ -145,47 +165,64 @@ static inline int sha256_cpuid(void) SHA256_TARGET("ssse3,sha") static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t count) { - const __m128i* buffer = (const __m128i*)block; + // similar way how sha1 works in with shani - // to byteswap when doing big-ending load for message dwords - const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); + // first 16 rounds loads message schedule dwords as 32-bit big endian values - static const uint32_t K[16][4] = - { - { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 }, - { 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 }, - { 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 }, - { 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 }, - { 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc }, - { 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da }, - { 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 }, - { 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 }, - { 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 }, - { 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 }, - { 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 }, - { 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 }, - { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 }, - { 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 }, - { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 }, - { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }, - }; + // for next rounds message schedule is prepared as: + // w[i] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16] + + // unrolled by 4: + // w[i+0] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16] + // w[i+1] = SSig1(w[i-1]) + w[i-6] + SSig0(w[i-14]) + w[i-15] + // w[i+2] = SSig1(w[i+0]) + w[i-5] + SSig0(w[i-13]) + w[i-14] + // w[i+3] = SSig1(w[i+1]) + w[i-4] + SSig0(w[i-12]) + w[i-13] + + // there is tricky dependency for lanes 2 and 3 on result of lanes 0 and 1, but sha256msg2 op takes care of that + + // by storing W[i] word in 128-bit simd register, the message schedule becomes: + // W(i) = SSig1(r0) + r1 + SSig0(r2) + r3 + // where + is 32-bit lane addition + + // [3] [2] [1] [0] // lanes + // r0 = [ special, special, w[i-1], w[i-2] ] + // r1 = [ w[i-4], w[i-5], w[i-6], w[i-7] ] + // r2 = [ w[i-12], w[i-13], w[i-14], w[i-15] ] + // r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ] + + // rN's can be calculated from previous W(..) values: + // r0 from W(i) + // r1 from _mm_alignr_epi8(W(i), W(i-1), 4) + // r2 from W(i-1) and W(i) + // r3 from W(i-1) + + // rounds i>2: W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32( W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4) ), W(i)) + // rounds i>0: W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i)) + + // round functions are done with _mm_sha256rnds2_epu32 which performs it for 2 rounds + // thus repeat it two times, as input use W(i) + K(i) - message schedule added with sha256 constants #define W(i) w[(i)%4] // 4 wide round calculations #define QROUND(i) do { \ - /* first four rounds loads input message */ \ + /* first 4 rounds load input block */ \ if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \ - /* add round constant */ \ - tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)K[i])); \ - /* update previous message dwords for next rounds */ \ + /* update message schedule */ \ if (i > 2 && i < 15) W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32(W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4)), W(i)); \ if (i > 0 && i < 13) W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i)); \ - /* round functions */ \ + /* add round constants */ \ + __m128i tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)&SHA256_K[4*i])); \ + /* 4 round functions */ \ state1 = _mm_sha256rnds2_epu32(state1, state0, tmp); \ state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(0,0,3,2))); \ } while(0) - + + const __m128i* buffer = (const __m128i*)block; + + // to byteswap when doing big-ending load for message dwords + const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); + // load initial state __m128i abcd = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] __m128i efgh = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h] @@ -200,18 +237,18 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c __m128i last0 = state0; __m128i last1 = state1; - __m128i tmp, w[4]; + __m128i w[4]; - QROUND(0); - QROUND(1); - QROUND(2); - QROUND(3); - QROUND(4); - QROUND(5); - QROUND(6); - QROUND(7); - QROUND(8); - QROUND(9); + QROUND( 0); + QROUND( 1); + QROUND( 2); + QROUND( 3); + QROUND( 4); + QROUND( 5); + QROUND( 6); + QROUND( 7); + QROUND( 8); + QROUND( 9); QROUND(10); QROUND(11); QROUND(12); @@ -241,6 +278,140 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c #endif // defined(__x86_64__) || defined(_M_AMD64) +#if defined(__aarch64__) || defined(_M_ARM64) + +#if defined(__clang__) +# define SHA256_TARGET __attribute__((target("sha2"))) +#elif defined(__GNUC__) +# define SHA256_TARGET __attribute__((target("+sha2"))) +#elif defined(_MSC_VER) +# define SHA256_TARGET +#endif + +#include + +#if defined(_WIN32) +# include +#elif defined(__linux__) +# include +# include +#elif defined(__APPLE__) +# include +#endif + +#define SHA256_CPUID_INIT (1 << 0) +#define SHA256_CPUID_ARM64 (1 << 1) + +static inline int sha256_cpuid(void) +{ +#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2) + int result = SHA256_CPUID_ARM64; +#else + static int cpuid; + + int result = cpuid; + if (result == 0) + { +#if defined(_WIN32) + int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE); +#elif defined(__linux__) + unsigned long hwcap = getauxval(AT_HWCAP); + int has_arm64 = hwcap & HWCAP_SHA2; +#elif defined(__APPLE__) + int value = 0; + size_t valuelen = sizeof(value); + int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA256", &value, &valuelen, NULL, 0) == 0 && value != 0; +#else +#error unknown platform +#endif + result |= SHA256_CPUID_INIT; + if (has_arm64) + { + result |= SHA256_CPUID_ARM64; + } + + cpuid = result; + } +#endif + +#if defined(SHA256_CPUID_MASK) + result &= SHA256_CPUID_MASK; +#endif + + return result; +} + +SHA256_TARGET +static void sha256_process_arm64(uint32_t* state, const uint8_t* block, size_t count) +{ + // code here is similar to x64 shani implementation + + #define W(i) w[(i)%4] + + #define QROUND(i) do { \ + /* load 16 round constants */ \ + if ((i % 4) == 0) rk = vld1q_u32_x4(&SHA256_K[4*i]); \ + /* first 4 rounds reverse byte order in each 32-bit lane of input block */ \ + if (i < 4) W(i) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[i])); \ + /* update message schedule */ \ + if (i >= 4) W(i) = vsha256su0q_u32(W(i), W(i-3)); \ + if (i >= 4) W(i) = vsha256su1q_u32(W(i), W(i-2), W(i-1)); \ + /* add round constants */ \ + uint32x4_t tmp = vaddq_u32(W(i), rk.val[i%4]); \ + /* 4 round functions */ \ + uint32x4_t x = vstate.val[0]; \ + vstate.val[0] = vsha256hq_u32(vstate.val[0], vstate.val[1], tmp); \ + vstate.val[1] = vsha256h2q_u32(vstate.val[1], x, tmp); \ + } while (0) + + // load initial state + uint32x4x2_t vstate = vld1q_u32_x2(state); + + do + { + // remember current state + uint32x4x2_t vlast = vstate; + + // load 64-byte block + uint8x16x4_t msg = vld1q_u8_x4(block); + + uint32x4x4_t rk; + uint32x4_t w[4]; + + QROUND( 0); + QROUND( 1); + QROUND( 2); + QROUND( 3); + QROUND( 4); + QROUND( 5); + QROUND( 6); + QROUND( 7); + QROUND( 8); + QROUND( 9); + QROUND(10); + QROUND(11); + QROUND(12); + QROUND(13); + QROUND(14); + QROUND(15); + + // update next state + vstate.val[0] = vaddq_u32(vstate.val[0], vlast.val[0]); + vstate.val[1] = vaddq_u32(vstate.val[1], vlast.val[1]); + + block += SHA256_BLOCK_SIZE; + } + while (--count); + + // save the new state + vst1q_u32_x2(state, vstate); + + #undef QROUND + #undef W +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) + static void sha256_process(uint32_t* state, const uint8_t* block, size_t count) { #if defined(__x86_64__) || defined(_M_AMD64) @@ -252,6 +423,15 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count) } #endif +#if defined(__aarch64__) || defined(_M_ARM64) + int cpuid = sha256_cpuid(); + if (cpuid & SHA256_CPUID_ARM64) + { + sha256_process_arm64(state, block, count); + return; + } +#endif + #define Ch(x,y,z) ((x & (y ^ z)) ^ z) #define Maj(x,y,z) ((x & y) | (z & (x | y))) @@ -262,13 +442,13 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count) #define W(i) w[(i+16)%16] - #define ROUND(i,a,b,c,d,e,f,g,h,K) do \ + #define ROUND(i,a,b,c,d,e,f,g,h) do \ { \ uint32_t w0; \ if (i < 16) W(i) = w0 = SHA256_GET32BE(block + i*sizeof(uint32_t)); \ if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \ \ - uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \ + uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA256_K[i] + w0; \ uint32_t t2 = BSig0(a) + Maj(a,b,c); \ d += t1; \ h = t1 + t2; \ @@ -287,70 +467,70 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count) uint32_t w[16]; - ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98); - ROUND( 1, h, a, b, c, d, e, f, g, 0x71374491); - ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); - ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba5); - ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25b); - ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1); - ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4); - ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5); - ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98); - ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b01); - ROUND(10, g, h, a, b, c, d, e, f, 0x243185be); - ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3); - ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74); - ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe); - ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); - ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174); - ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c1); - ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786); - ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); - ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc); - ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f); - ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa); - ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); - ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da); - ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152); - ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d); - ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c8); - ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7); - ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); - ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147); - ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351); - ROUND(31, b, c, d, e, f, g, h, a, 0x14292967); - ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a85); - ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b2138); - ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); - ROUND(35, f, g, h, a, b, c, d, e, 0x53380d13); - ROUND(36, e, f, g, h, a, b, c, d, 0x650a7354); - ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb); - ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e); - ROUND(39, b, c, d, e, f, g, h, a, 0x92722c85); - ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); - ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664b); - ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70); - ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a3); - ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819); - ROUND(45, d, e, f, g, h, a, b, c, 0xd6990624); - ROUND(46, c, d, e, f, g, h, a, b, 0xf40e3585); - ROUND(47, b, c, d, e, f, g, h, a, 0x106aa070); - ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116); - ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c08); - ROUND(50, g, h, a, b, c, d, e, f, 0x2748774c); - ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); - ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3); - ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); - ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); - ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3); - ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee); - ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f); - ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814); - ROUND(59, f, g, h, a, b, c, d, e, 0x8cc70208); - ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa); - ROUND(61, d, e, f, g, h, a, b, c, 0xa4506ceb); - ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); - ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2); + ROUND( 0, a, b, c, d, e, f, g, h); + ROUND( 1, h, a, b, c, d, e, f, g); + ROUND( 2, g, h, a, b, c, d, e, f); + ROUND( 3, f, g, h, a, b, c, d, e); + ROUND( 4, e, f, g, h, a, b, c, d); + ROUND( 5, d, e, f, g, h, a, b, c); + ROUND( 6, c, d, e, f, g, h, a, b); + ROUND( 7, b, c, d, e, f, g, h, a); + ROUND( 8, a, b, c, d, e, f, g, h); + ROUND( 9, h, a, b, c, d, e, f, g); + ROUND(10, g, h, a, b, c, d, e, f); + ROUND(11, f, g, h, a, b, c, d, e); + ROUND(12, e, f, g, h, a, b, c, d); + ROUND(13, d, e, f, g, h, a, b, c); + ROUND(14, c, d, e, f, g, h, a, b); + ROUND(15, b, c, d, e, f, g, h, a); + ROUND(16, a, b, c, d, e, f, g, h); + ROUND(17, h, a, b, c, d, e, f, g); + ROUND(18, g, h, a, b, c, d, e, f); + ROUND(19, f, g, h, a, b, c, d, e); + ROUND(20, e, f, g, h, a, b, c, d); + ROUND(21, d, e, f, g, h, a, b, c); + ROUND(22, c, d, e, f, g, h, a, b); + ROUND(23, b, c, d, e, f, g, h, a); + ROUND(24, a, b, c, d, e, f, g, h); + ROUND(25, h, a, b, c, d, e, f, g); + ROUND(26, g, h, a, b, c, d, e, f); + ROUND(27, f, g, h, a, b, c, d, e); + ROUND(28, e, f, g, h, a, b, c, d); + ROUND(29, d, e, f, g, h, a, b, c); + ROUND(30, c, d, e, f, g, h, a, b); + ROUND(31, b, c, d, e, f, g, h, a); + ROUND(32, a, b, c, d, e, f, g, h); + ROUND(33, h, a, b, c, d, e, f, g); + ROUND(34, g, h, a, b, c, d, e, f); + ROUND(35, f, g, h, a, b, c, d, e); + ROUND(36, e, f, g, h, a, b, c, d); + ROUND(37, d, e, f, g, h, a, b, c); + ROUND(38, c, d, e, f, g, h, a, b); + ROUND(39, b, c, d, e, f, g, h, a); + ROUND(40, a, b, c, d, e, f, g, h); + ROUND(41, h, a, b, c, d, e, f, g); + ROUND(42, g, h, a, b, c, d, e, f); + ROUND(43, f, g, h, a, b, c, d, e); + ROUND(44, e, f, g, h, a, b, c, d); + ROUND(45, d, e, f, g, h, a, b, c); + ROUND(46, c, d, e, f, g, h, a, b); + ROUND(47, b, c, d, e, f, g, h, a); + ROUND(48, a, b, c, d, e, f, g, h); + ROUND(49, h, a, b, c, d, e, f, g); + ROUND(50, g, h, a, b, c, d, e, f); + ROUND(51, f, g, h, a, b, c, d, e); + ROUND(52, e, f, g, h, a, b, c, d); + ROUND(53, d, e, f, g, h, a, b, c); + ROUND(54, c, d, e, f, g, h, a, b); + ROUND(55, b, c, d, e, f, g, h, a); + ROUND(56, a, b, c, d, e, f, g, h); + ROUND(57, h, a, b, c, d, e, f, g); + ROUND(58, g, h, a, b, c, d, e, f); + ROUND(59, f, g, h, a, b, c, d, e); + ROUND(60, e, f, g, h, a, b, c, d); + ROUND(61, d, e, f, g, h, a, b, c); + ROUND(62, c, d, e, f, g, h, a, b); + ROUND(63, b, c, d, e, f, g, h, a); state[0] += a; state[1] += b; diff --git a/src/third_party/martins_hash/sha512.h b/src/third_party/martins_hash/sha512.h index 2a7dad07..4be1ef68 100644 --- a/src/third_party/martins_hash/sha512.h +++ b/src/third_party/martins_hash/sha512.h @@ -58,8 +58,8 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S #if defined(_MSC_VER) # include -# define SHA512_GET64BE(ptr) _byteswap_uint64( *((const _UNALIGNED uint64_t*)(ptr)) ) -# define SHA512_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +# define SHA512_GET64BE(ptr) _byteswap_uint64( *((const __unaligned uint64_t*)(ptr)) ) +# define SHA512_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x) #else # define SHA512_GET64BE(ptr) \ ( \ @@ -86,9 +86,33 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S while (0) #endif +static const uint64_t SHA512_K[80] = +{ + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, +}; + #if defined(__x86_64__) || defined(_M_AMD64) -#include +#include // AVX2 + SHA512 #if defined(__clang__) || defined(__GNUC__) # include @@ -117,7 +141,7 @@ static inline int sha512_cpuid(void) { int info[4]; - SHA256_CPUID(1, info); + SHA512_CPUID(1, info); int has_xsave = info[2] & (1 << 26); int has_ymm = 0; @@ -127,13 +151,13 @@ static inline int sha512_cpuid(void) has_ymm = xcr0 & (1 << 2); } - SHA256_CPUID_EX(7, 0, info); + SHA512_CPUID_EX(7, 0, info); int has_avx2 = info[1] & (1 << 5); - SHA256_CPUID_EX(7, 1, info); + SHA512_CPUID_EX(7, 1, info); int has_sha512 = info[0] & (1 << 0); - result |= SHA256_CPUID_INIT; + result |= SHA512_CPUID_INIT; if (has_ymm && has_avx2 && has_sha512) { result |= SHA512_CPUID_VSHA512; @@ -152,51 +176,32 @@ static inline int sha512_cpuid(void) SHA512_TARGET("avx2,sha512") static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t count) { - const __m256i* buffer = (const __m256i*)block; + // pretty much same way how sha256 works, only with avx2 registers and 64-bit additions + // state is kept as two 256-bit ymm registers (8 qwords) - // to byteswap when doing big-ending load for message qwords - const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8)); - - static const uint64_t K[20][4] = - { - { 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc }, - { 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 }, - { 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 }, - { 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 }, - { 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 }, - { 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 }, - { 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 }, - { 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 }, - { 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df }, - { 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b }, - { 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 }, - { 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 }, - { 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 }, - { 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 }, - { 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec }, - { 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b }, - { 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 }, - { 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b }, - { 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c }, - { 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }, - }; + // message qwords are loaded as 64-bit big-endian values #define W(i) w[(i)%4] // 4 wide round calculations #define QROUND(i) do { \ - /* first four rounds loads input message */ \ + /* first 4 rounds load input block */ \ if (i < 4) W(i) = _mm256_shuffle_epi8(_mm256_loadu_si256(&buffer[i]), bswap); \ - /* add round constant */ \ - tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)K[i])); \ - /* update previous message qwords for next rounds */ \ + /* update message schedule */ \ if (i > 2 && i < 19) W(i-3) = _mm256_sha512msg2_epi64(_mm256_add_epi64(W(i-3), _mm256_permute4x64_epi64(_mm256_blend_epi32(W(i-1), W(i), 3), _MM_SHUFFLE(0,3,2,1))), W(i)); \ if (i > 0 && i < 17) W(i-1) = _mm256_sha512msg1_epi64(W(i-1), _mm256_castsi256_si128(W(i))); \ + /* add round constants */ \ + __m256i tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)&SHA512_K[4*i])); \ /* round functions */ \ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_castsi256_si128(tmp)); \ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(tmp, 1)); \ } while(0) + const __m256i* buffer = (const __m256i*)block; + + // to byteswap when doing big-ending load for message qwords + const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8)); + // load initial state __m256i abcd = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] __m256i efgh = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h] @@ -211,7 +216,7 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t __m256i last0 = state0; __m256i last1 = state1; - __m256i tmp, w[4]; + __m256i w[4]; QROUND(0); QROUND(1); @@ -256,6 +261,189 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t #endif // defined(__x86_64__) || defined(_M_AMD64) +#if defined(__aarch64__) || defined(_M_ARM64) + +#if defined(__clang__) +# define SHA512_TARGET __attribute__((target("sha3"))) +#elif defined(__GNUC__) +# define SHA512_TARGET __attribute__((target("+sha3"))) +#elif defined(_MSC_VER) +# define SHA512_TARGET +#endif + +#include + +#if defined(_WIN32) +# include +# pragma comment (lib, "advapi32") +#elif defined(__linux__) +# include +# include +#elif defined(__APPLE__) +# include +#endif + +#define SHA512_CPUID_INIT (1 << 0) +#define SHA512_CPUID_ARM64 (1 << 1) + +#if defined(_WIN32) + +#endif + +static inline int sha512_cpuid(void) +{ +#if defined(__ARM_FEATURE_SHA512) + int result = SHA512_CPUID_ARM64; +#else + static int cpuid; + + int result = cpuid; + if (result == 0) + { +#if defined(_WIN32) + // no sha512 bit in IsProcessorFeaturePresent function :( + uint64_t bits; + DWORD bitsize = sizeof(bits); + RegGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "CP 4030", RRF_RT_QWORD | RRF_ZEROONFAILURE, NULL, &bits, &bitsize); + // bits from ID_AA64ISAR0_EL1 + int has_arm64 = ((bits >> 15) & 0xf) == 0x2; +#elif defined(__linux__) + unsigned long hwcap = getauxval(AT_HWCAP); + int has_arm64 = hwcap & HWCAP_SHA512; +#elif defined(__APPLE__) + int value = 0; + size_t valuelen = sizeof(value); + int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA512", &value, &valuelen, NULL, 0) == 0 && value != 0; +#else +#error unknown platform +#endif + result |= SHA512_CPUID_INIT; + if (has_arm64) + { + result |= SHA512_CPUID_ARM64; + } + + cpuid = result; + } +#endif + +#if defined(SHA512_CPUID_MASK) + result &= SHA512_CPUID_MASK; +#endif + + return result; +} + +SHA512_TARGET +static void sha512_process_arm64(uint64_t* state, const uint8_t* block, size_t count) +{ + #define W(i) w[(i)%8] + #define S(i) vstate.val[3-(i)%4] + + #define DROUND(i) do { \ + /* load 8 round constants */ \ + if ((i % 4) == 0) rk = vld1q_u64_x4(&SHA512_K[2*i]); \ + /* first 8 rounds reverse byte order in each 64-bit lane of input block */ \ + if (i < 8) W(i) = vreinterpretq_u64_u8(vrev64q_u8(msg[(i/4)%2].val[i%4])); \ + /* update message schedule for next rounds */ \ + if (i >= 8) W(i) = vsha512su1q_u64(vsha512su0q_u64(W(i), W(i-7)), W(i-1), vextq_u64(W(i-4), W(i-3), 1)); \ + /* add round constants */ \ + uint64x2_t tmp = vaddq_u64(W(i), rk.val[i%4]); \ + /* 2 round functions */ \ + uint64x2_t x0 = vaddq_u64(vextq_u64(tmp, tmp, 1), S(i+0)); \ + uint64x2_t x1 = vsha512hq_u64(x0, vextq_u64(S(i+1), S(i+0), 1), vextq_u64(S(i+2), S(i+1), 1)); \ + S(i+0) = vsha512h2q_u64(x1, S(i+2), S(i+3)); \ + S(i+2) = vaddq_u64(S(i+2), x1); \ + } while (0) + + // load initial state + uint64x2x4_t vstate = vld1q_u64_x4(state); + + do + { + // remember current state + uint64x2x4_t vlast = vstate; + + // load 128-byte block + uint8x16x4_t msg[2] = + { + vld1q_u8_x4(block + 0 * 16), + vld1q_u8_x4(block + 4 * 16), + }; + + uint64x2x4_t rk; + uint64x2_t w[8]; + + DROUND( 0); + DROUND( 1); + DROUND( 2); + DROUND( 3); + + DROUND( 4); + DROUND( 5); + DROUND( 6); + DROUND( 7); + + DROUND( 8); + DROUND( 9); + DROUND(10); + DROUND(11); + + DROUND(12); + DROUND(13); + DROUND(14); + DROUND(15); + + DROUND(16); + DROUND(17); + DROUND(18); + DROUND(19); + + DROUND(20); + DROUND(21); + DROUND(22); + DROUND(23); + + DROUND(24); + DROUND(25); + DROUND(26); + DROUND(27); + + DROUND(28); + DROUND(29); + DROUND(30); + DROUND(31); + + DROUND(32); + DROUND(33); + DROUND(34); + DROUND(35); + + DROUND(36); + DROUND(37); + DROUND(38); + DROUND(39); + + // update next state + vstate.val[0] = vaddq_u64(vstate.val[0], vlast.val[0]); + vstate.val[1] = vaddq_u64(vstate.val[1], vlast.val[1]); + vstate.val[2] = vaddq_u64(vstate.val[2], vlast.val[2]); + vstate.val[3] = vaddq_u64(vstate.val[3], vlast.val[3]); + + block += SHA512_BLOCK_SIZE; + } + while (--count); + + // save the new state + vst1q_u64_x4(state, vstate); + + #undef DROUND + #undef S + #undef W +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) + static void sha512_process(uint64_t* state, const uint8_t* block, size_t count) { #if defined(__x86_64__) || defined(_M_AMD64) @@ -267,6 +455,15 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count) } #endif +#if defined(__aarch64__) || defined(_M_ARM64) + int cpuid = sha512_cpuid(); + if (cpuid & SHA512_CPUID_ARM64) + { + sha512_process_arm64(state, block, count); + return; + } +#endif + #define Ch(x,y,z) ((x & (y ^ z)) ^ z) #define Maj(x,y,z) ((x & y) | (z & (x | y))) @@ -277,13 +474,13 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count) #define W(i) w[(i+16)%16] - #define ROUND(i,a,b,c,d,e,f,g,h,K) do \ + #define ROUND(i,a,b,c,d,e,f,g,h) do \ { \ uint64_t w0; \ if (i < 16) W(i) = w0 = SHA512_GET64BE(block + i*sizeof(uint64_t)); \ if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \ \ - uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \ + uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA512_K[i] + w0; \ uint64_t t2 = BSig0(a) + Maj(a,b,c); \ d += t1; \ h = t1 + t2; \ @@ -302,86 +499,86 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count) uint64_t w[16]; - ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22); - ROUND( 1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd); - ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f); - ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc); - ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538); - ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019); - ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b); - ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118); - ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242); - ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe); - ROUND(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c); - ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2); - ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f); - ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1); - ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235); - ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694); - ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2); - ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3); - ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5); - ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65); - ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275); - ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483); - ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4); - ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5); - ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab); - ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210); - ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f); - ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4); - ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2); - ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725); - ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f); - ROUND(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70); - ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc); - ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926); - ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed); - ROUND(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df); - ROUND(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de); - ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8); - ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6); - ROUND(39, b, c, d, e, f, g, h, a, 0x92722c851482353b); - ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364); - ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001); - ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791); - ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30); - ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218); - ROUND(45, d, e, f, g, h, a, b, c, 0xd69906245565a910); - ROUND(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a); - ROUND(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8); - ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8); - ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53); - ROUND(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99); - ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8); - ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63); - ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb); - ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373); - ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3); - ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc); - ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60); - ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72); - ROUND(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec); - ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28); - ROUND(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9); - ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915); - ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); - ROUND(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c); - ROUND(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207); - ROUND(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e); - ROUND(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178); - ROUND(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba); - ROUND(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6); - ROUND(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae); - ROUND(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b); - ROUND(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84); - ROUND(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493); - ROUND(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc); - ROUND(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c); - ROUND(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6); - ROUND(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a); - ROUND(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec); - ROUND(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); + ROUND( 0, a, b, c, d, e, f, g, h); + ROUND( 1, h, a, b, c, d, e, f, g); + ROUND( 2, g, h, a, b, c, d, e, f); + ROUND( 3, f, g, h, a, b, c, d, e); + ROUND( 4, e, f, g, h, a, b, c, d); + ROUND( 5, d, e, f, g, h, a, b, c); + ROUND( 6, c, d, e, f, g, h, a, b); + ROUND( 7, b, c, d, e, f, g, h, a); + ROUND( 8, a, b, c, d, e, f, g, h); + ROUND( 9, h, a, b, c, d, e, f, g); + ROUND(10, g, h, a, b, c, d, e, f); + ROUND(11, f, g, h, a, b, c, d, e); + ROUND(12, e, f, g, h, a, b, c, d); + ROUND(13, d, e, f, g, h, a, b, c); + ROUND(14, c, d, e, f, g, h, a, b); + ROUND(15, b, c, d, e, f, g, h, a); + ROUND(16, a, b, c, d, e, f, g, h); + ROUND(17, h, a, b, c, d, e, f, g); + ROUND(18, g, h, a, b, c, d, e, f); + ROUND(19, f, g, h, a, b, c, d, e); + ROUND(20, e, f, g, h, a, b, c, d); + ROUND(21, d, e, f, g, h, a, b, c); + ROUND(22, c, d, e, f, g, h, a, b); + ROUND(23, b, c, d, e, f, g, h, a); + ROUND(24, a, b, c, d, e, f, g, h); + ROUND(25, h, a, b, c, d, e, f, g); + ROUND(26, g, h, a, b, c, d, e, f); + ROUND(27, f, g, h, a, b, c, d, e); + ROUND(28, e, f, g, h, a, b, c, d); + ROUND(29, d, e, f, g, h, a, b, c); + ROUND(30, c, d, e, f, g, h, a, b); + ROUND(31, b, c, d, e, f, g, h, a); + ROUND(32, a, b, c, d, e, f, g, h); + ROUND(33, h, a, b, c, d, e, f, g); + ROUND(34, g, h, a, b, c, d, e, f); + ROUND(35, f, g, h, a, b, c, d, e); + ROUND(36, e, f, g, h, a, b, c, d); + ROUND(37, d, e, f, g, h, a, b, c); + ROUND(38, c, d, e, f, g, h, a, b); + ROUND(39, b, c, d, e, f, g, h, a); + ROUND(40, a, b, c, d, e, f, g, h); + ROUND(41, h, a, b, c, d, e, f, g); + ROUND(42, g, h, a, b, c, d, e, f); + ROUND(43, f, g, h, a, b, c, d, e); + ROUND(44, e, f, g, h, a, b, c, d); + ROUND(45, d, e, f, g, h, a, b, c); + ROUND(46, c, d, e, f, g, h, a, b); + ROUND(47, b, c, d, e, f, g, h, a); + ROUND(48, a, b, c, d, e, f, g, h); + ROUND(49, h, a, b, c, d, e, f, g); + ROUND(50, g, h, a, b, c, d, e, f); + ROUND(51, f, g, h, a, b, c, d, e); + ROUND(52, e, f, g, h, a, b, c, d); + ROUND(53, d, e, f, g, h, a, b, c); + ROUND(54, c, d, e, f, g, h, a, b); + ROUND(55, b, c, d, e, f, g, h, a); + ROUND(56, a, b, c, d, e, f, g, h); + ROUND(57, h, a, b, c, d, e, f, g); + ROUND(58, g, h, a, b, c, d, e, f); + ROUND(59, f, g, h, a, b, c, d, e); + ROUND(60, e, f, g, h, a, b, c, d); + ROUND(61, d, e, f, g, h, a, b, c); + ROUND(62, c, d, e, f, g, h, a, b); + ROUND(63, b, c, d, e, f, g, h, a); + ROUND(64, a, b, c, d, e, f, g, h); + ROUND(65, h, a, b, c, d, e, f, g); + ROUND(66, g, h, a, b, c, d, e, f); + ROUND(67, f, g, h, a, b, c, d, e); + ROUND(68, e, f, g, h, a, b, c, d); + ROUND(69, d, e, f, g, h, a, b, c); + ROUND(70, c, d, e, f, g, h, a, b); + ROUND(71, b, c, d, e, f, g, h, a); + ROUND(72, a, b, c, d, e, f, g, h); + ROUND(73, h, a, b, c, d, e, f, g); + ROUND(74, g, h, a, b, c, d, e, f); + ROUND(75, f, g, h, a, b, c, d, e); + ROUND(76, e, f, g, h, a, b, c, d); + ROUND(77, d, e, f, g, h, a, b, c); + ROUND(78, c, d, e, f, g, h, a, b); + ROUND(79, b, c, d, e, f, g, h, a); state[0] += a; state[1] += b;