update third_party/martins_hash with latest code

2026-07-29 18:50:03 +00:00 · 2025-10-20 14:45:09 -07:00
parent 4bc6240ac0
commit 6589cbe374
4 changed files with 842 additions and 250 deletions
@@ -46,9 +46,9 @@ static inline void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]);
 #endif

 #if defined(_MSC_VER)
-#   define MD5_GET32LE(ptr) *((const _UNALIGNED uint32_t*)(ptr))
-#   define MD5_SET32LE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = (x)
-#   define MD5_SET64LE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = (x)
+#   define MD5_GET32LE(ptr) *((const __unaligned uint32_t*)(ptr))
+#   define MD5_SET32LE(ptr,x) *((__unaligned uint32_t*)(ptr)) = (x)
+#   define MD5_SET64LE(ptr,x) *((__unaligned uint64_t*)(ptr)) = (x)
 #else
 #   define MD5_GET32LE(ptr) \
    (                       \
@@ -431,5 +431,5 @@ void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE])
 }

 #if defined(__clang__)
-#pragma clang diagnostic pop
+#   pragma clang diagnostic pop
 #endif
@@ -50,9 +50,9 @@ static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]);

 #if defined(_MSC_VER)
 #   include <stdlib.h>
-#   define SHA1_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) )
-#   define SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
-#   define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
+#   define SHA1_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) )
+#   define SHA1_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
+#   define SHA1_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
 #else
 #   define SHA1_GET32BE(ptr) \
    (                       \
@@ -137,36 +137,86 @@ static inline int sha1_cpuid(void)
 SHA1_TARGET("ssse3,sha")
 static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
 {
-    const __m128i* buffer = (const __m128i*)block;
+    // in SHA1 each round has two parts:
+    // 1) calculate message schedule dwords in w[i]
+    // 2) do round functions to update a/b/c/d/e state values using w[i]

-    // for performing two operations in one:
-    // 1) dwords need to be loaded as big-endian
-    // 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0]
-    const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+    // w[i] in first 16 rounds is just loaded from block bytes, as 32-bit big-endian load
+
+    // for next rounds it is done as:
+    // w[i] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
+    // where ROL(x) = 32-bit rotate left by 1
+
+    // this means it is possible to keep just the last 16 of w's in circular buffer
+    // and every new w calculated will need to update 1 to 3 previous w's
+
+    // unrolling round calculations by 4 we get:
+    // w[i+0] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
+    // w[i+1] = ROL(w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15])
+    // w[i+2] = ROL(w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14])
+    // w[i+3] = ROL(w[i+0] ^ w[i-5] ^ w[i-11] ^ w[i-13])
+
+    // now if you store 4 w[..] values in 128-bit SSE register, then
+    // W(i) = ROL( r0 ^ r1 ^ r2 ^ r3 )
+    // with caveat that r0 lane 3 depends on W(i) lane 0
+
+    //         [3]      [2]      [1]      [0]      // lanes
+    // r0 = [ special, w[i-1],  w[i-2],  w[i-3]  ]
+    // r1 = [ w[i-5],  w[i-6],  w[i-7],  w[i-8]  ]
+    // r2 = [ w[i-11], w[i-12], w[i-13], w[i-14] ]
+    // r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
+
+    // in each 4-round i'th step it is possible to incrementally update new W(..) value when
+    // keeping W(i) values in 4 xmm element circular buffer
+
+    // rounds i>0: W(i-1) = r2 ^ r3          = _mm_sha1msg1_epu32(W(i-1), W(i))
+    // rounds i>1: W(i-2) = W(i-2) ^ r1      = _mm_xor_si128     (W(i-2), W(i))
+    // rounds i>2: W(i-3) = ROL(W(i-3) ^ r0) = _mm_sha1msg2_epu32(W(i-3), W(i))
+    // then the new W(i) can be used in round function calculations
+    // _mm_sha1msg2_epu32 correctly handles r0 lane 3 dependency on W(i) lane 0
+
+    // to perform round functions on two SIMD registers with state as:
+    // abcd = [a,b,c,d]
+    //   e0 = [e,0,0,0]
+    // use the following code to get next abcd/e0 state 4 rounds at a time:
+
+    //       tmp = _mm_sha1nexte_epu32(e0, W(i))      // rotates e0 and adds message dwords
+    // abcd_next = _mm_sha1rnds4_epu32(abcd, tmp, Fn) // with Fn = 0..3 round function selection
+    //   e0_next = abcd
+
+    // sha1nexte is not needed on first round, just regular add32(e0, W(i)) should be used
+    // after last round need to do extra rotation, which sha1nexte takes care when adding to last_e0

    #define W(i) w[(i)%4]

    // 4 wide round calculations
    #define QROUND(i) do {                                                          \
-        /* first four rounds loads input message */                                 \
+        /* first 4 rounds load input block */                                       \
        if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap);     \
-        /* update previous message dwords for next rounds */                        \
+        /* update message schedule */                                               \
        if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i));             \
-        if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i));                  \
+        if (i > 1 && i < 18) W(i-2) = _mm_xor_si128     (W(i-2), W(i));             \
        if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i));             \
-        /* calculate E from message dwords */                                       \
-        if (i == 0) tmp = _mm_add_epi32(e0, W(i));                                  \
+        /* calculate E plus message schedule */                                     \
+        if (i == 0) tmp = _mm_add_epi32      (e0, W(i));                            \
        if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i));                            \
-        /* round function */                                                        \
+        /* 4 round functions */                                                     \
        e0 = abcd;                                                                  \
-        abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4);                             \
+        abcd = _mm_sha1rnds4_epu32(abcd, tmp, i/5);                                 \
    } while(0)

+    const __m128i* buffer = (const __m128i*)block;
+
+    // for performing two operations in one:
+    // 1) dwords need to be loaded as big-endian
+    // 2) order of dwords need to be reversed for sha1 instructions: [0,1,2,3] -> [3,2,1,0]
+    const __m128i bswap = _mm_setr_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);
+
    // load initial state
    __m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a]
    __m128i e0 = _mm_loadu_si32(&state[4]);                // [0,0,0,e]

-    // change dword order
+    // flip dword order, to what sha1 instructions use
    abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane
    e0 = _mm_slli_si128(e0, 12);                          // [e,0,0,0] where e is in top lane

@@ -183,16 +233,19 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou
        QROUND(2);
        QROUND(3);
        QROUND(4);
+
        QROUND(5);
        QROUND(6);
        QROUND(7);
        QROUND(8);
        QROUND(9);
+
        QROUND(10);
        QROUND(11);
        QROUND(12);
        QROUND(13);
        QROUND(14);
+
        QROUND(15);
        QROUND(16);
        QROUND(17);
@@ -221,6 +274,159 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou

 #endif // defined(__x86_64__) || defined(_M_AMD64)

+
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(__clang__)
+#   define SHA1_TARGET __attribute__((target("sha2")))
+#elif defined(__GNUC__)
+#   define SHA1_TARGET __attribute__((target("+sha2")))
+#elif defined(_MSC_VER)
+#   define SHA1_TARGET
+#endif
+
+#include <arm_neon.h>
+
+#if defined(_WIN32)
+#   include <windows.h>
+#elif defined(__linux__)
+#   include <sys/auxv.h>
+#   include <asm/hwcap.h>
+#elif defined(__APPLE__)
+#   include <sys/sysctl.h>
+#endif
+
+#define SHA1_CPUID_INIT  (1 << 0)
+#define SHA1_CPUID_ARM64 (1 << 1)
+
+static inline int sha1_cpuid(void)
+{
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
+    int result = SHA1_CPUID_ARM64;
+#else
+    static int cpuid;
+
+    int result = cpuid;
+    if (result == 0)
+    {
+#if defined(_WIN32)
+        int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+#elif defined(__linux__)
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        int has_arm64 = hwcap & HWCAP_SHA1;
+#elif defined(__APPLE__)
+        int value = 0;
+        size_t valuelen = sizeof(value);
+        int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA1", &value, &valuelen, NULL, 0) == 0 && value != 0;
+#else
+#error unknown platform
+#endif
+        result |= SHA1_CPUID_INIT;
+        if (has_arm64)
+        {
+            result |= SHA1_CPUID_ARM64;
+        }
+
+        cpuid = result;
+    }
+#endif
+
+#if defined(SHA1_CPUID_MASK)
+    result &= SHA1_CPUID_MASK;
+#endif
+
+    return result;
+}
+
+SHA1_TARGET
+static void sha1_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
+{
+    // code here is similar to x64 shani implementation
+
+    // message array is 16 element circular buffer
+    // each iteration updates 4 rounds at the same time
+
+    #define W(i) w[(i)%4]
+
+    #define QROUND(i,F,k) do {                                  \
+        /* update message schedule */                           \
+        if (i >= 4) W(i) = vsha1su0q_u32(W(i), W(i-3), W(i-2)); \
+        if (i >= 4) W(i) = vsha1su1q_u32(W(i), W(i-1));         \
+        /* add round constant */                                \
+        uint32x4_t tmp = vaddq_u32(W(i), k);                    \
+        /* 4 round functions */                                 \
+        uint32_t x = e0;                                        \
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0));               \
+        abcd = F(abcd, x, tmp);                                 \
+    } while (0)
+
+    const uint32x4_t k0 = vdupq_n_u32(0x5a827999);
+    const uint32x4_t k1 = vdupq_n_u32(0x6ed9eba1);
+    const uint32x4_t k2 = vdupq_n_u32(0x8f1bbcdc);
+    const uint32x4_t k3 = vdupq_n_u32(0xca62c1d6);
+
+    // load state - a,b,c,d,e
+    uint32x4_t abcd = vld1q_u32(state);
+    uint32_t   e0   = state[4];
+
+    do
+    {
+        // remember current state
+        uint32x4_t last_abcd = abcd;
+        uint32_t   last_e0   = e0;
+
+        // load 64-byte block and advance pointer to next block
+        uint8x16x4_t msg = vld1q_u8_x4(block);
+        block += SHA1_BLOCK_SIZE;
+
+        uint32x4_t w[4];
+
+        // for first 16 w's reverse the byte order in each 32-bit lane
+        W(0) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[0]));
+        W(1) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[1]));
+        W(2) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[2]));
+        W(3) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[3]));
+
+        QROUND( 0, vsha1cq_u32, k0);
+        QROUND( 1, vsha1cq_u32, k0);
+        QROUND( 2, vsha1cq_u32, k0);
+        QROUND( 3, vsha1cq_u32, k0);
+        QROUND( 4, vsha1cq_u32, k0);
+
+        QROUND( 5, vsha1pq_u32, k1);
+        QROUND( 6, vsha1pq_u32, k1);
+        QROUND( 7, vsha1pq_u32, k1);
+        QROUND( 8, vsha1pq_u32, k1);
+        QROUND( 9, vsha1pq_u32, k1);
+
+        QROUND(10, vsha1mq_u32, k2);
+        QROUND(11, vsha1mq_u32, k2);
+        QROUND(12, vsha1mq_u32, k2);
+        QROUND(13, vsha1mq_u32, k2);
+        QROUND(14, vsha1mq_u32, k2);
+
+        QROUND(15, vsha1pq_u32, k3);
+        QROUND(16, vsha1pq_u32, k3);
+        QROUND(17, vsha1pq_u32, k3);
+        QROUND(18, vsha1pq_u32, k3);
+        QROUND(19, vsha1pq_u32, k3);
+
+        // update next state
+        abcd = vaddq_u32(abcd, last_abcd);
+        e0  += last_e0;
+    }
+    while (--count);
+
+    // save state
+    vst1q_u32(state, abcd);
+    state[4] = e0;
+
+    #undef QROUND
+    #undef W
+}
+
+#endif // defined(__aarch64__) || defined(_M_ARM64)
+
 static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
@@ -232,12 +438,21 @@ static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
    }
 #endif

+#if defined(__aarch64__) || defined(_M_ARM64)
+    int cpuid = sha1_cpuid();
+    if (cpuid & SHA1_CPUID_ARM64)
+    {
+        sha1_process_arm64(state, block, count);
+        return;
+    }    
+#endif
+
    #define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z))
    #define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z))
    #define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y))))
    #define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z))

-    #define W(i) w[(i+16)%16]
+    #define W(i) w[(i)%16]

    #define ROUND(i,a,b,c,d,e,F) do                                                     \
    {                                                                                   \
@@ -58,9 +58,9 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S

 #if defined(_MSC_VER)
 #   include <stdlib.h>
-#   define SHA256_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) )
-#   define SHA256_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
-#   define SHA256_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
+#   define SHA256_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) )
+#   define SHA256_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
+#   define SHA256_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
 #else
 #   define SHA256_GET32BE(ptr)  \
    (                           \
@@ -91,6 +91,26 @@ static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_S
    while (0)
 #endif

+static const uint32_t SHA256_K[64] =
+{
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+
 #if defined(__x86_64__) || defined(_M_AMD64)

 #include <tmmintrin.h> // SSSE3
@@ -145,47 +165,64 @@ static inline int sha256_cpuid(void)
 SHA256_TARGET("ssse3,sha")
 static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t count)
 {
-    const __m128i* buffer = (const __m128i*)block;
+    // similar way how sha1 works in with shani

-    // to byteswap when doing big-ending load for message dwords
-    const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
+    // first 16 rounds loads message schedule dwords as 32-bit big endian values

-    static const uint32_t K[16][4] =
-    {
-        { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 },
-        { 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 },
-        { 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 },
-        { 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 },
-        { 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc },
-        { 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da },
-        { 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 },
-        { 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 },
-        { 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 },
-        { 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 },
-        { 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 },
-        { 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 },
-        { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 },
-        { 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 },
-        { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 },
-        { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 },
-    };
+    // for next rounds message schedule is prepared as:
+    // w[i] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16]
+
+    // unrolled by 4:
+    // w[i+0] = SSig1(w[i-2]) + w[i-7] + SSig0(w[i-15]) + w[i-16]
+    // w[i+1] = SSig1(w[i-1]) + w[i-6] + SSig0(w[i-14]) + w[i-15]
+    // w[i+2] = SSig1(w[i+0]) + w[i-5] + SSig0(w[i-13]) + w[i-14]
+    // w[i+3] = SSig1(w[i+1]) + w[i-4] + SSig0(w[i-12]) + w[i-13]
+
+    // there is tricky dependency for lanes 2 and 3 on result of lanes 0 and 1, but sha256msg2 op takes care of that
+
+    // by storing W[i] word in 128-bit simd register, the message schedule becomes:
+    // W(i) = SSig1(r0) + r1 + SSig0(r2) + r3
+    // where + is 32-bit lane addition
+
+    //         [3]      [2]      [1]      [0]      // lanes
+    // r0 = [ special, special, w[i-1],  w[i-2]  ]
+    // r1 = [ w[i-4],  w[i-5],  w[i-6],  w[i-7]  ]
+    // r2 = [ w[i-12], w[i-13], w[i-14], w[i-15] ]
+    // r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
+
+    // rN's can be calculated from previous W(..) values:
+    // r0 from W(i)
+    // r1 from _mm_alignr_epi8(W(i), W(i-1), 4)
+    // r2 from W(i-1) and W(i)
+    // r3 from W(i-1)
+
+    // rounds i>2: W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32( W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4) ), W(i))
+    // rounds i>0: W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i))
+
+    // round functions are done with _mm_sha256rnds2_epu32 which performs it for 2 rounds
+    // thus repeat it two times, as input use W(i) + K(i) - message schedule added with sha256 constants

    #define W(i) w[(i)%4]

    // 4 wide round calculations
    #define QROUND(i) do {                                                                                                  \
-        /* first four rounds loads input message */                                                                         \
+        /* first 4 rounds load input block */                                                                               \
        if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap);                                             \
-        /* add round constant */                                                                                            \
-        tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)K[i]));                                                   \
-        /* update previous message dwords for next rounds */                                                                \
+        /* update message schedule */                                                                                       \
        if (i > 2 && i < 15) W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32(W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4)), W(i));  \
        if (i > 0 && i < 13) W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i));                                                   \
-        /* round functions */                                                                                               \
+        /* add round constants */                                                                                           \
+        __m128i tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)&SHA256_K[4*i]));                                 \
+        /* 4 round functions */                                                                                             \
        state1 = _mm_sha256rnds2_epu32(state1, state0, tmp);                                                                \
        state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(0,0,3,2)));                       \
    } while(0)
-        
+
+    const __m128i* buffer = (const __m128i*)block;
+
+    // to byteswap when doing big-ending load for message dwords
+    const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
+       
    // load initial state 
    __m128i abcd = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d]
    __m128i efgh = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h]
@@ -200,18 +237,18 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c
        __m128i last0 = state0;
        __m128i last1 = state1;

-        __m128i tmp, w[4];
+        __m128i w[4];

-        QROUND(0);
-        QROUND(1);
-        QROUND(2);
-        QROUND(3);
-        QROUND(4);
-        QROUND(5);
-        QROUND(6);
-        QROUND(7);
-        QROUND(8);
-        QROUND(9);
+        QROUND( 0);
+        QROUND( 1);
+        QROUND( 2);
+        QROUND( 3);
+        QROUND( 4);
+        QROUND( 5);
+        QROUND( 6);
+        QROUND( 7);
+        QROUND( 8);
+        QROUND( 9);
        QROUND(10);
        QROUND(11);
        QROUND(12);
@@ -241,6 +278,140 @@ static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t c

 #endif // defined(__x86_64__) || defined(_M_AMD64)

+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(__clang__)
+#   define SHA256_TARGET __attribute__((target("sha2")))
+#elif defined(__GNUC__)
+#   define SHA256_TARGET __attribute__((target("+sha2")))
+#elif defined(_MSC_VER)
+#   define SHA256_TARGET
+#endif
+
+#include <arm_neon.h>
+
+#if defined(_WIN32)
+#   include <windows.h>
+#elif defined(__linux__)
+#   include <sys/auxv.h>
+#   include <asm/hwcap.h>
+#elif defined(__APPLE__)
+#   include <sys/sysctl.h>
+#endif
+
+#define SHA256_CPUID_INIT  (1 << 0)
+#define SHA256_CPUID_ARM64 (1 << 1)
+
+static inline int sha256_cpuid(void)
+{
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
+    int result = SHA256_CPUID_ARM64;
+#else
+    static int cpuid;
+
+    int result = cpuid;
+    if (result == 0)
+    {
+#if defined(_WIN32)
+        int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+#elif defined(__linux__)
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        int has_arm64 = hwcap & HWCAP_SHA2;
+#elif defined(__APPLE__)
+        int value = 0;
+        size_t valuelen = sizeof(value);
+        int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA256", &value, &valuelen, NULL, 0) == 0 && value != 0;
+#else
+#error unknown platform
+#endif
+        result |= SHA256_CPUID_INIT;
+        if (has_arm64)
+        {
+            result |= SHA256_CPUID_ARM64;
+        }
+
+        cpuid = result;
+    }
+#endif
+
+#if defined(SHA256_CPUID_MASK)
+    result &= SHA256_CPUID_MASK;
+#endif
+
+    return result;
+}
+
+SHA256_TARGET
+static void sha256_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
+{
+    // code here is similar to x64 shani implementation
+
+    #define W(i) w[(i)%4]
+
+    #define QROUND(i) do {                                                          \
+        /* load 16 round constants */                                               \
+        if ((i % 4) == 0) rk = vld1q_u32_x4(&SHA256_K[4*i]);                        \
+        /* first 4 rounds reverse byte order in each 32-bit lane of input block */  \
+        if (i <  4) W(i) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[i]));            \
+        /* update message schedule */                                               \
+        if (i >= 4) W(i) = vsha256su0q_u32(W(i), W(i-3));                           \
+        if (i >= 4) W(i) = vsha256su1q_u32(W(i), W(i-2), W(i-1));                   \
+        /* add round constants */                                                   \
+        uint32x4_t tmp = vaddq_u32(W(i), rk.val[i%4]);                              \
+        /* 4 round functions */                                                     \
+        uint32x4_t x = vstate.val[0];                                               \
+        vstate.val[0] = vsha256hq_u32(vstate.val[0], vstate.val[1], tmp);           \
+        vstate.val[1] = vsha256h2q_u32(vstate.val[1], x, tmp);                      \
+    } while (0)
+
+    // load initial state
+    uint32x4x2_t vstate = vld1q_u32_x2(state);
+
+    do
+    {
+        // remember current state
+        uint32x4x2_t vlast = vstate;
+
+        // load 64-byte block
+        uint8x16x4_t msg = vld1q_u8_x4(block);
+
+        uint32x4x4_t rk;
+        uint32x4_t w[4];
+
+        QROUND( 0);
+        QROUND( 1);
+        QROUND( 2);
+        QROUND( 3);
+        QROUND( 4);
+        QROUND( 5);
+        QROUND( 6);
+        QROUND( 7);
+        QROUND( 8);
+        QROUND( 9);
+        QROUND(10);
+        QROUND(11);
+        QROUND(12);
+        QROUND(13);
+        QROUND(14);
+        QROUND(15);
+
+        // update next state
+        vstate.val[0] = vaddq_u32(vstate.val[0], vlast.val[0]);
+        vstate.val[1] = vaddq_u32(vstate.val[1], vlast.val[1]);
+
+        block += SHA256_BLOCK_SIZE;
+    }
+    while (--count);
+
+    // save the new state
+    vst1q_u32_x2(state, vstate);
+
+    #undef QROUND
+    #undef W
+}
+
+#endif // defined(__aarch64__) || defined(_M_ARM64)
+
 static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
@@ -252,6 +423,15 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)
    }
 #endif

+#if defined(__aarch64__) || defined(_M_ARM64)
+    int cpuid = sha256_cpuid();
+    if (cpuid & SHA256_CPUID_ARM64)
+    {
+        sha256_process_arm64(state, block, count);
+        return;
+    }
+#endif
+
    #define Ch(x,y,z)  ((x & (y ^ z)) ^ z)
    #define Maj(x,y,z) ((x & y) | (z & (x | y)))

@@ -262,13 +442,13 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)

    #define W(i) w[(i+16)%16]

-    #define ROUND(i,a,b,c,d,e,f,g,h,K) do                                           \
+    #define ROUND(i,a,b,c,d,e,f,g,h) do                                             \
    {                                                                               \
        uint32_t w0;                                                                \
        if (i <  16) W(i) = w0 = SHA256_GET32BE(block + i*sizeof(uint32_t));        \
        if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \
                                                                                    \
-        uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0;                            \
+        uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA256_K[i] + w0;                  \
        uint32_t t2 = BSig0(a) + Maj(a,b,c);                                        \
        d += t1;                                                                    \
        h = t1 + t2;                                                                \
@@ -287,70 +467,70 @@ static void sha256_process(uint32_t* state, const uint8_t* block, size_t count)

        uint32_t w[16];

-        ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98);
-        ROUND( 1, h, a, b, c, d, e, f, g, 0x71374491);
-        ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
-        ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
-        ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25b);
-        ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1);
-        ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4);
-        ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
-        ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98);
-        ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b01);
-        ROUND(10, g, h, a, b, c, d, e, f, 0x243185be);
-        ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
-        ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74);
-        ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
-        ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
-        ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174);
-        ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
-        ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786);
-        ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
-        ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
-        ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
-        ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
-        ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
-        ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da);
-        ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152);
-        ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d);
-        ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c8);
-        ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
-        ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
-        ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147);
-        ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351);
-        ROUND(31, b, c, d, e, f, g, h, a, 0x14292967);
-        ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a85);
-        ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
-        ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
-        ROUND(35, f, g, h, a, b, c, d, e, 0x53380d13);
-        ROUND(36, e, f, g, h, a, b, c, d, 0x650a7354);
-        ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb);
-        ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
-        ROUND(39, b, c, d, e, f, g, h, a, 0x92722c85);
-        ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
-        ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664b);
-        ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
-        ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
-        ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819);
-        ROUND(45, d, e, f, g, h, a, b, c, 0xd6990624);
-        ROUND(46, c, d, e, f, g, h, a, b, 0xf40e3585);
-        ROUND(47, b, c, d, e, f, g, h, a, 0x106aa070);
-        ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116);
-        ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c08);
-        ROUND(50, g, h, a, b, c, d, e, f, 0x2748774c);
-        ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
-        ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
-        ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
-        ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
-        ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
-        ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee);
-        ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f);
-        ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814);
-        ROUND(59, f, g, h, a, b, c, d, e, 0x8cc70208);
-        ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa);
-        ROUND(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
-        ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
-        ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+        ROUND( 0, a, b, c, d, e, f, g, h);
+        ROUND( 1, h, a, b, c, d, e, f, g);
+        ROUND( 2, g, h, a, b, c, d, e, f);
+        ROUND( 3, f, g, h, a, b, c, d, e);
+        ROUND( 4, e, f, g, h, a, b, c, d);
+        ROUND( 5, d, e, f, g, h, a, b, c);
+        ROUND( 6, c, d, e, f, g, h, a, b);
+        ROUND( 7, b, c, d, e, f, g, h, a);
+        ROUND( 8, a, b, c, d, e, f, g, h);
+        ROUND( 9, h, a, b, c, d, e, f, g);
+        ROUND(10, g, h, a, b, c, d, e, f);
+        ROUND(11, f, g, h, a, b, c, d, e);
+        ROUND(12, e, f, g, h, a, b, c, d);
+        ROUND(13, d, e, f, g, h, a, b, c);
+        ROUND(14, c, d, e, f, g, h, a, b);
+        ROUND(15, b, c, d, e, f, g, h, a);
+        ROUND(16, a, b, c, d, e, f, g, h);
+        ROUND(17, h, a, b, c, d, e, f, g);
+        ROUND(18, g, h, a, b, c, d, e, f);
+        ROUND(19, f, g, h, a, b, c, d, e);
+        ROUND(20, e, f, g, h, a, b, c, d);
+        ROUND(21, d, e, f, g, h, a, b, c);
+        ROUND(22, c, d, e, f, g, h, a, b);
+        ROUND(23, b, c, d, e, f, g, h, a);
+        ROUND(24, a, b, c, d, e, f, g, h);
+        ROUND(25, h, a, b, c, d, e, f, g);
+        ROUND(26, g, h, a, b, c, d, e, f);
+        ROUND(27, f, g, h, a, b, c, d, e);
+        ROUND(28, e, f, g, h, a, b, c, d);
+        ROUND(29, d, e, f, g, h, a, b, c);
+        ROUND(30, c, d, e, f, g, h, a, b);
+        ROUND(31, b, c, d, e, f, g, h, a);
+        ROUND(32, a, b, c, d, e, f, g, h);
+        ROUND(33, h, a, b, c, d, e, f, g);
+        ROUND(34, g, h, a, b, c, d, e, f);
+        ROUND(35, f, g, h, a, b, c, d, e);
+        ROUND(36, e, f, g, h, a, b, c, d);
+        ROUND(37, d, e, f, g, h, a, b, c);
+        ROUND(38, c, d, e, f, g, h, a, b);
+        ROUND(39, b, c, d, e, f, g, h, a);
+        ROUND(40, a, b, c, d, e, f, g, h);
+        ROUND(41, h, a, b, c, d, e, f, g);
+        ROUND(42, g, h, a, b, c, d, e, f);
+        ROUND(43, f, g, h, a, b, c, d, e);
+        ROUND(44, e, f, g, h, a, b, c, d);
+        ROUND(45, d, e, f, g, h, a, b, c);
+        ROUND(46, c, d, e, f, g, h, a, b);
+        ROUND(47, b, c, d, e, f, g, h, a);
+        ROUND(48, a, b, c, d, e, f, g, h);
+        ROUND(49, h, a, b, c, d, e, f, g);
+        ROUND(50, g, h, a, b, c, d, e, f);
+        ROUND(51, f, g, h, a, b, c, d, e);
+        ROUND(52, e, f, g, h, a, b, c, d);
+        ROUND(53, d, e, f, g, h, a, b, c);
+        ROUND(54, c, d, e, f, g, h, a, b);
+        ROUND(55, b, c, d, e, f, g, h, a);
+        ROUND(56, a, b, c, d, e, f, g, h);
+        ROUND(57, h, a, b, c, d, e, f, g);
+        ROUND(58, g, h, a, b, c, d, e, f);
+        ROUND(59, f, g, h, a, b, c, d, e);
+        ROUND(60, e, f, g, h, a, b, c, d);
+        ROUND(61, d, e, f, g, h, a, b, c);
+        ROUND(62, c, d, e, f, g, h, a, b);
+        ROUND(63, b, c, d, e, f, g, h, a);

        state[0] += a;
        state[1] += b;
@@ -58,8 +58,8 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S

 #if defined(_MSC_VER)
 #   include <stdlib.h>
-#   define SHA512_GET64BE(ptr) _byteswap_uint64( *((const _UNALIGNED uint64_t*)(ptr)) )
-#   define SHA512_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
+#   define SHA512_GET64BE(ptr) _byteswap_uint64( *((const __unaligned uint64_t*)(ptr)) )
+#   define SHA512_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
 #else
 #   define SHA512_GET64BE(ptr)              \
    (                                       \
@@ -86,9 +86,33 @@ static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_S
    while (0)
 #endif

+static const uint64_t SHA512_K[80] =
+{
+    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
+    0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
+    0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+    0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694,
+    0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
+    0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+    0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4,
+    0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70,
+    0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+    0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b,
+    0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30,
+    0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+    0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
+    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
+    0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+    0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b,
+    0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
+    0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+    0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
+    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
+};
+
 #if defined(__x86_64__) || defined(_M_AMD64)

-#include <immintrin.h>
+#include <immintrin.h> // AVX2 + SHA512

 #if defined(__clang__) || defined(__GNUC__)
 #   include <cpuid.h>
@@ -117,7 +141,7 @@ static inline int sha512_cpuid(void)
    {
        int info[4];

-        SHA256_CPUID(1, info);
+        SHA512_CPUID(1, info);
        int has_xsave = info[2] & (1 << 26);

        int has_ymm = 0;
@@ -127,13 +151,13 @@ static inline int sha512_cpuid(void)
            has_ymm = xcr0 & (1 << 2);
        }

-        SHA256_CPUID_EX(7, 0, info);
+        SHA512_CPUID_EX(7, 0, info);
        int has_avx2 = info[1] & (1 << 5);

-        SHA256_CPUID_EX(7, 1, info);
+        SHA512_CPUID_EX(7, 1, info);
        int has_sha512 = info[0] & (1 << 0);

-        result |= SHA256_CPUID_INIT;
+        result |= SHA512_CPUID_INIT;
        if (has_ymm && has_avx2 && has_sha512)
        {
            result |= SHA512_CPUID_VSHA512;
@@ -152,51 +176,32 @@ static inline int sha512_cpuid(void)
 SHA512_TARGET("avx2,sha512")
 static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t count)
 {
-    const __m256i* buffer = (const __m256i*)block;
+    // pretty much same way how sha256 works, only with avx2 registers and 64-bit additions
+    // state is kept as two 256-bit ymm registers (8 qwords)

-    // to byteswap when doing big-ending load for message qwords
-    const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8));
-
-    static const uint64_t K[20][4] =
-    {
-        { 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc },
-        { 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 },
-        { 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 },
-        { 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 },
-        { 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 },
-        { 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 },
-        { 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 },
-        { 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 },
-        { 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df },
-        { 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b },
-        { 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 },
-        { 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 },
-        { 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 },
-        { 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 },
-        { 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec },
-        { 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b },
-        { 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 },
-        { 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b },
-        { 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c },
-        { 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 },
-    };
+    // message qwords are loaded as 64-bit big-endian values

    #define W(i) w[(i)%4]

    // 4 wide round calculations
    #define QROUND(i) do {                                                                                                                                                          \
-        /* first four rounds loads input message */                                                                                                                                 \
+        /* first 4 rounds load input block */                                                                                                                                       \
        if (i < 4) W(i) = _mm256_shuffle_epi8(_mm256_loadu_si256(&buffer[i]), bswap);                                                                                               \
-        /* add round constant */                                                                                                                                                    \
-        tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)K[i]));                                                                                                     \
-        /* update previous message qwords for next rounds */                                                                                                                        \
+        /* update message schedule */                                                                                                                                               \
        if (i > 2 && i < 19) W(i-3) = _mm256_sha512msg2_epi64(_mm256_add_epi64(W(i-3), _mm256_permute4x64_epi64(_mm256_blend_epi32(W(i-1), W(i), 3), _MM_SHUFFLE(0,3,2,1))), W(i)); \
        if (i > 0 && i < 17) W(i-1) = _mm256_sha512msg1_epi64(W(i-1), _mm256_castsi256_si128(W(i)));                                                                                \
+        /* add round constants */                                                                                                                                                   \
+        __m256i tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)&SHA512_K[4*i]));                                                                                   \
        /* round functions */                                                                                                                                                       \
        state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_castsi256_si128(tmp));                                                                                             \
        state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(tmp, 1));                                                                                        \
    } while(0)

+    const __m256i* buffer = (const __m256i*)block;
+
+    // to byteswap when doing big-ending load for message qwords
+    const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8));
+
    // load initial state 
    __m256i abcd = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d]
    __m256i efgh = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h]
@@ -211,7 +216,7 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t
        __m256i last0 = state0;
        __m256i last1 = state1;

-        __m256i tmp, w[4];
+        __m256i w[4];

        QROUND(0);
        QROUND(1);
@@ -256,6 +261,189 @@ static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t

 #endif // defined(__x86_64__) || defined(_M_AMD64)

+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(__clang__)
+#   define SHA512_TARGET __attribute__((target("sha3")))
+#elif defined(__GNUC__)
+#   define SHA512_TARGET __attribute__((target("+sha3")))
+#elif defined(_MSC_VER)
+#   define SHA512_TARGET
+#endif
+
+#include <arm_neon.h>
+
+#if defined(_WIN32)
+#   include <windows.h>
+#   pragma comment (lib, "advapi32")
+#elif defined(__linux__)
+#   include <sys/auxv.h>
+#   include <asm/hwcap.h>
+#elif defined(__APPLE__)
+#   include <sys/sysctl.h>
+#endif
+
+#define SHA512_CPUID_INIT  (1 << 0)
+#define SHA512_CPUID_ARM64 (1 << 1)
+
+#if defined(_WIN32)
+
+#endif
+
+static inline int sha512_cpuid(void)
+{
+#if defined(__ARM_FEATURE_SHA512)
+    int result = SHA512_CPUID_ARM64;
+#else
+    static int cpuid;
+
+    int result = cpuid;
+    if (result == 0)
+    {
+#if defined(_WIN32)
+        // no sha512 bit in IsProcessorFeaturePresent function :(
+        uint64_t bits;
+        DWORD bitsize = sizeof(bits);
+        RegGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "CP 4030", RRF_RT_QWORD | RRF_ZEROONFAILURE, NULL, &bits, &bitsize);
+        // bits from ID_AA64ISAR0_EL1
+        int has_arm64 = ((bits >> 15) & 0xf) == 0x2;
+#elif defined(__linux__)
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        int has_arm64 = hwcap & HWCAP_SHA512;
+#elif defined(__APPLE__)
+        int value = 0;
+        size_t valuelen = sizeof(value);
+        int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA512", &value, &valuelen, NULL, 0) == 0 && value != 0;
+#else
+#error unknown platform
+#endif
+        result |= SHA512_CPUID_INIT;
+        if (has_arm64)
+        {
+            result |= SHA512_CPUID_ARM64;
+        }
+
+        cpuid = result;
+    }
+#endif
+
+#if defined(SHA512_CPUID_MASK)
+    result &= SHA512_CPUID_MASK;
+#endif
+
+    return result;
+}
+
+SHA512_TARGET
+static void sha512_process_arm64(uint64_t* state, const uint8_t* block, size_t count)
+{
+    #define W(i) w[(i)%8]
+    #define S(i) vstate.val[3-(i)%4]
+
+    #define DROUND(i) do {                                                                                          \
+        /* load 8 round constants */                                                                                \
+        if ((i % 4) == 0) rk = vld1q_u64_x4(&SHA512_K[2*i]);                                                        \
+        /* first 8 rounds reverse byte order in each 64-bit lane of input block */                                  \
+        if (i <  8) W(i) = vreinterpretq_u64_u8(vrev64q_u8(msg[(i/4)%2].val[i%4]));                                 \
+        /* update message schedule for next rounds */                                                               \
+        if (i >= 8) W(i) = vsha512su1q_u64(vsha512su0q_u64(W(i), W(i-7)), W(i-1), vextq_u64(W(i-4), W(i-3), 1));    \
+        /* add round constants */                                                                                   \
+        uint64x2_t tmp = vaddq_u64(W(i), rk.val[i%4]);                                                              \
+        /* 2 round functions */                                                                                     \
+        uint64x2_t x0 = vaddq_u64(vextq_u64(tmp, tmp, 1), S(i+0));                                                  \
+        uint64x2_t x1 = vsha512hq_u64(x0, vextq_u64(S(i+1), S(i+0), 1), vextq_u64(S(i+2), S(i+1), 1));              \
+        S(i+0) = vsha512h2q_u64(x1, S(i+2), S(i+3));                                                                \
+        S(i+2) = vaddq_u64(S(i+2), x1);                                                                             \
+    } while (0)
+
+    // load initial state
+    uint64x2x4_t vstate = vld1q_u64_x4(state);
+
+    do
+    {
+        // remember current state
+        uint64x2x4_t vlast = vstate;
+
+        // load 128-byte block
+        uint8x16x4_t msg[2] =
+        {
+            vld1q_u8_x4(block + 0 * 16),
+            vld1q_u8_x4(block + 4 * 16),
+        };
+
+        uint64x2x4_t rk;
+        uint64x2_t w[8];
+
+        DROUND( 0);
+        DROUND( 1);
+        DROUND( 2);
+        DROUND( 3);
+
+        DROUND( 4);
+        DROUND( 5);
+        DROUND( 6);
+        DROUND( 7);
+
+        DROUND( 8);
+        DROUND( 9);
+        DROUND(10);
+        DROUND(11);
+
+        DROUND(12);
+        DROUND(13);
+        DROUND(14);
+        DROUND(15);
+
+        DROUND(16);
+        DROUND(17);
+        DROUND(18);
+        DROUND(19);
+
+        DROUND(20);
+        DROUND(21);
+        DROUND(22);
+        DROUND(23);
+
+        DROUND(24);
+        DROUND(25);
+        DROUND(26);
+        DROUND(27);
+
+        DROUND(28);
+        DROUND(29);
+        DROUND(30);
+        DROUND(31);
+
+        DROUND(32);
+        DROUND(33);
+        DROUND(34);
+        DROUND(35);
+
+        DROUND(36);
+        DROUND(37);
+        DROUND(38);
+        DROUND(39);
+
+        // update next state
+        vstate.val[0] = vaddq_u64(vstate.val[0], vlast.val[0]);
+        vstate.val[1] = vaddq_u64(vstate.val[1], vlast.val[1]);
+        vstate.val[2] = vaddq_u64(vstate.val[2], vlast.val[2]);
+        vstate.val[3] = vaddq_u64(vstate.val[3], vlast.val[3]);
+
+        block += SHA512_BLOCK_SIZE;
+    }
+    while (--count);
+
+    // save the new state
+    vst1q_u64_x4(state, vstate);
+
+    #undef DROUND
+    #undef S
+    #undef W
+}
+
+#endif // defined(__aarch64__) || defined(_M_ARM64)
+
 static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
@@ -267,6 +455,15 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)
    }
 #endif

+#if defined(__aarch64__) || defined(_M_ARM64)
+    int cpuid = sha512_cpuid();
+    if (cpuid & SHA512_CPUID_ARM64)
+    {
+        sha512_process_arm64(state, block, count);
+        return;
+    }
+#endif
+
    #define Ch(x,y,z)  ((x & (y ^ z)) ^ z)
    #define Maj(x,y,z) ((x & y) | (z & (x | y)))

@@ -277,13 +474,13 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)

    #define W(i) w[(i+16)%16]

-    #define ROUND(i,a,b,c,d,e,f,g,h,K) do                                           \
+    #define ROUND(i,a,b,c,d,e,f,g,h) do                                             \
    {                                                                               \
        uint64_t w0;                                                                \
        if (i <  16) W(i) = w0 = SHA512_GET64BE(block + i*sizeof(uint64_t));        \
        if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \
                                                                                    \
-        uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0;                            \
+        uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + SHA512_K[i] + w0;                  \
        uint64_t t2 = BSig0(a) + Maj(a,b,c);                                        \
        d += t1;                                                                    \
        h = t1 + t2;                                                                \
@@ -302,86 +499,86 @@ static void sha512_process(uint64_t* state, const uint8_t* block, size_t count)

        uint64_t w[16];

-        ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
-        ROUND( 1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
-        ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
-        ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
-        ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
-        ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
-        ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
-        ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
-        ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
-        ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
-        ROUND(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
-        ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
-        ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
-        ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
-        ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
-        ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
-        ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
-        ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
-        ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
-        ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
-        ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
-        ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
-        ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
-        ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
-        ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
-        ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
-        ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
-        ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
-        ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
-        ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
-        ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
-        ROUND(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
-        ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
-        ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
-        ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
-        ROUND(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
-        ROUND(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
-        ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
-        ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
-        ROUND(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
-        ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
-        ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
-        ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
-        ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
-        ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
-        ROUND(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
-        ROUND(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
-        ROUND(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
-        ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
-        ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
-        ROUND(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
-        ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
-        ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
-        ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
-        ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
-        ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
-        ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
-        ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
-        ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
-        ROUND(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
-        ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
-        ROUND(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
-        ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
-        ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b);
-        ROUND(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
-        ROUND(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
-        ROUND(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
-        ROUND(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
-        ROUND(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
-        ROUND(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
-        ROUND(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
-        ROUND(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
-        ROUND(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
-        ROUND(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
-        ROUND(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
-        ROUND(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
-        ROUND(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
-        ROUND(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
-        ROUND(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
-        ROUND(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817);
+        ROUND( 0, a, b, c, d, e, f, g, h);
+        ROUND( 1, h, a, b, c, d, e, f, g);
+        ROUND( 2, g, h, a, b, c, d, e, f);
+        ROUND( 3, f, g, h, a, b, c, d, e);
+        ROUND( 4, e, f, g, h, a, b, c, d);
+        ROUND( 5, d, e, f, g, h, a, b, c);
+        ROUND( 6, c, d, e, f, g, h, a, b);
+        ROUND( 7, b, c, d, e, f, g, h, a);
+        ROUND( 8, a, b, c, d, e, f, g, h);
+        ROUND( 9, h, a, b, c, d, e, f, g);
+        ROUND(10, g, h, a, b, c, d, e, f);
+        ROUND(11, f, g, h, a, b, c, d, e);
+        ROUND(12, e, f, g, h, a, b, c, d);
+        ROUND(13, d, e, f, g, h, a, b, c);
+        ROUND(14, c, d, e, f, g, h, a, b);
+        ROUND(15, b, c, d, e, f, g, h, a);
+        ROUND(16, a, b, c, d, e, f, g, h);
+        ROUND(17, h, a, b, c, d, e, f, g);
+        ROUND(18, g, h, a, b, c, d, e, f);
+        ROUND(19, f, g, h, a, b, c, d, e);
+        ROUND(20, e, f, g, h, a, b, c, d);
+        ROUND(21, d, e, f, g, h, a, b, c);
+        ROUND(22, c, d, e, f, g, h, a, b);
+        ROUND(23, b, c, d, e, f, g, h, a);
+        ROUND(24, a, b, c, d, e, f, g, h);
+        ROUND(25, h, a, b, c, d, e, f, g);
+        ROUND(26, g, h, a, b, c, d, e, f);
+        ROUND(27, f, g, h, a, b, c, d, e);
+        ROUND(28, e, f, g, h, a, b, c, d);
+        ROUND(29, d, e, f, g, h, a, b, c);
+        ROUND(30, c, d, e, f, g, h, a, b);
+        ROUND(31, b, c, d, e, f, g, h, a);
+        ROUND(32, a, b, c, d, e, f, g, h);
+        ROUND(33, h, a, b, c, d, e, f, g);
+        ROUND(34, g, h, a, b, c, d, e, f);
+        ROUND(35, f, g, h, a, b, c, d, e);
+        ROUND(36, e, f, g, h, a, b, c, d);
+        ROUND(37, d, e, f, g, h, a, b, c);
+        ROUND(38, c, d, e, f, g, h, a, b);
+        ROUND(39, b, c, d, e, f, g, h, a);
+        ROUND(40, a, b, c, d, e, f, g, h);
+        ROUND(41, h, a, b, c, d, e, f, g);
+        ROUND(42, g, h, a, b, c, d, e, f);
+        ROUND(43, f, g, h, a, b, c, d, e);
+        ROUND(44, e, f, g, h, a, b, c, d);
+        ROUND(45, d, e, f, g, h, a, b, c);
+        ROUND(46, c, d, e, f, g, h, a, b);
+        ROUND(47, b, c, d, e, f, g, h, a);
+        ROUND(48, a, b, c, d, e, f, g, h);
+        ROUND(49, h, a, b, c, d, e, f, g);
+        ROUND(50, g, h, a, b, c, d, e, f);
+        ROUND(51, f, g, h, a, b, c, d, e);
+        ROUND(52, e, f, g, h, a, b, c, d);
+        ROUND(53, d, e, f, g, h, a, b, c);
+        ROUND(54, c, d, e, f, g, h, a, b);
+        ROUND(55, b, c, d, e, f, g, h, a);
+        ROUND(56, a, b, c, d, e, f, g, h);
+        ROUND(57, h, a, b, c, d, e, f, g);
+        ROUND(58, g, h, a, b, c, d, e, f);
+        ROUND(59, f, g, h, a, b, c, d, e);
+        ROUND(60, e, f, g, h, a, b, c, d);
+        ROUND(61, d, e, f, g, h, a, b, c);
+        ROUND(62, c, d, e, f, g, h, a, b);
+        ROUND(63, b, c, d, e, f, g, h, a);
+        ROUND(64, a, b, c, d, e, f, g, h);
+        ROUND(65, h, a, b, c, d, e, f, g);
+        ROUND(66, g, h, a, b, c, d, e, f);
+        ROUND(67, f, g, h, a, b, c, d, e);
+        ROUND(68, e, f, g, h, a, b, c, d);
+        ROUND(69, d, e, f, g, h, a, b, c);
+        ROUND(70, c, d, e, f, g, h, a, b);
+        ROUND(71, b, c, d, e, f, g, h, a);
+        ROUND(72, a, b, c, d, e, f, g, h);
+        ROUND(73, h, a, b, c, d, e, f, g);
+        ROUND(74, g, h, a, b, c, d, e, f);
+        ROUND(75, f, g, h, a, b, c, d, e);
+        ROUND(76, e, f, g, h, a, b, c, d);
+        ROUND(77, d, e, f, g, h, a, b, c);
+        ROUND(78, c, d, e, f, g, h, a, b);
+        ROUND(79, b, c, d, e, f, g, h, a);

        state[0] += a;
        state[1] += b;