diff --git a/src/base/base_core.h b/src/base/base_core.h index bd67f5c6..a6f41da6 100644 --- a/src/base/base_core.h +++ b/src/base/base_core.h @@ -84,6 +84,37 @@ # define C_LINKAGE #endif +//////////////////////////////// +//~ rjf: Optimization Settings + +#if COMPILER_MSVC +# define OPTIMIZE_BEGIN _Pragma("optimize(\"\", on)") +# define OPTIMIZE_END _Pragma("optimize(\"\", off)") +#elif COMPILER_CLANG +# define OPTIMIZE_BEGIN _Pragma("clang optimize on") +# define OPTIMIZE_END _Pragma("clang optimize off") +#elif COMPILER_GCC +# define OPTIMIZE_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"O2\")") +# define OPTIMIZE_END _Pragma("GCC pop_options") +#else +# define OPTIMIZE_BEGIN +# define OPTIMIZE_END +#endif + +#if COMPILER_MSVC && !BUILD_DEBUG +# define NO_OPTIMIZE_BEGIN _Pragma("optimize(\"\", off)") +# define NO_OPTIMIZE_END _Pragma("optimize(\"\", on)") +#elif COMPILER_CLANG && !BUILD_DEBUG +# define NO_OPTIMIZE_BEGIN _Pragma("clang optimize off") +# define NO_OPTIMIZE_END _Pragma("clang optimize on") +#elif COMPILER_GCC && !BUILD_DEBUG +# define NO_OPTIMIZE_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"O0\")") +# define NO_OPTIMIZE_END _Pragma("GCC pop_options") +#else +# define NO_OPTIMIZE_BEGIN +# define NO_OPTIMIZE_END +#endif + //////////////////////////////// //~ rjf: Versions diff --git a/src/base/base_hash.c b/src/base/base_hash.c index 6799d6b2..0ca3f684 100644 --- a/src/base/base_hash.c +++ b/src/base/base_hash.c @@ -4,37 +4,34 @@ //////////////////////////////// //~ rjf: MD5 -#if !defined(MD5_API) -# define MD5_API static -# include "third_party/md5/md5.c" -# include "third_party/md5/md5.h" -#endif +#include "third_party/martins_hash/md5.h" internal MD5 md5_from_data(String8 data) { - MD5_CTX ctx = {0}; - MD5_Init(&ctx); - MD5_Update(&ctx, (void*)data.str, data.size); + md5_ctx ctx = {0}; + md5_init(&ctx); + md5_update(&ctx, (void*)data.str, data.size); MD5 result = {0}; - MD5_Final(result.u8, &ctx); + md5_finish(&ctx, 
result.u8); return result; } //////////////////////////////// -//~ rjf: SHA1 +//~ rjf: SHA -#include "third_party/tomcrypt_hash/tomcrypt_hash.h" +#include "third_party/martins_hash/sha1.h" +#include "third_party/martins_hash/sha256.h" internal SHA1 sha1_from_data(String8 data) { SHA1 result = {0}; { - SHA1State state = {0}; - sha1_init(&state); - sha1_process(&state, data.str, data.size); - sha1_done(&state, result.u8); + sha1_ctx ctx = {0}; + sha1_init(&ctx); + sha1_update(&ctx, data.str, data.size); + sha1_finish(&ctx, result.u8); } return result; } @@ -44,10 +41,10 @@ sha256_from_data(String8 data) { SHA256 result = {0}; { - SHA256State state = {0}; - sha256_init(&state); - sha256_process(&state, data.str, data.size); - sha256_done(&state, result.u8); + sha256_ctx ctx = {0}; + sha256_init(&ctx); + sha256_update(&ctx, data.str, data.size); + sha256_finish(&ctx, result.u8); } return result; } diff --git a/src/ctrl/ctrl_core.c b/src/ctrl/ctrl_core.c index 5f4cdedf..9bc20a44 100644 --- a/src/ctrl/ctrl_core.c +++ b/src/ctrl/ctrl_core.c @@ -6136,13 +6136,14 @@ internal C_Key ctrl_key_from_process_vaddr_range(CTRL_Handle process, Rng1U64 vaddr_range, B32 zero_terminated, B32 wait_for_fresh, U64 endt_us, B32 *out_is_stale) { ProfBeginFunction(); +#pragma pack(push, 1) struct { CTRL_Handle process; Rng1U64 vaddr_range; B32 zero_terminated; - B32 _padding_; } key_data = {process, vaddr_range, zero_terminated}; +#pragma pack(pop) String8 key = str8_struct(&key_data); Access *access = access_open(); AC_Artifact artifact = ac_artifact_from_key(access, key, ctrl_memory_artifact_create, ctrl_memory_artifact_destroy, endt_us, diff --git a/src/raddbg/raddbg_core.c b/src/raddbg/raddbg_core.c index 03383933..e9ca65a1 100644 --- a/src/raddbg/raddbg_core.c +++ b/src/raddbg/raddbg_core.c @@ -5779,10 +5779,11 @@ rd_store_view_loading_info(B32 is_loading, U64 progress_u64, U64 progress_u64_ta { RD_Cfg *view = rd_cfg_from_id(rd_regs()->view); RD_ViewState *view_state = 
rd_view_state_from_cfg(view); + B32 loading_state_is_new = (is_loading && view_state->loading_t_target != (F32)!!is_loading); view_state->loading_t_target = (F32)!!is_loading; view_state->loading_progress_v = progress_u64; view_state->loading_progress_v_target = progress_u64_target; - if(view_state->last_frame_index_built+1 < rd_state->frame_index) + if(loading_state_is_new || view_state->last_frame_index_built+1 < rd_state->frame_index) { view_state->loading_t = view_state->loading_t_target; } @@ -5983,7 +5984,7 @@ rd_window_state_from_os_handle(OS_Handle os) } #if COMPILER_MSVC && !BUILD_DEBUG -#pragma optimize("", off) +NO_OPTIMIZE_BEGIN #endif internal void @@ -9954,7 +9955,7 @@ rd_window_frame(void) } #if COMPILER_MSVC && !BUILD_DEBUG -#pragma optimize("", on) +NO_OPTIMIZE_END #endif //////////////////////////////// diff --git a/src/raddbg/raddbg_views.c b/src/raddbg/raddbg_views.c index 431f1e9f..ec71f5b0 100644 --- a/src/raddbg/raddbg_views.c +++ b/src/raddbg/raddbg_views.c @@ -4024,12 +4024,14 @@ RD_VIEW_UI_FUNCTION_DEF(bitmap) for EachIndex(rewind_idx, C_KEY_HASH_HISTORY_COUNT) { U128 hash = c_hash_from_key(texture_key, rewind_idx); +#pragma pack(push, 1) struct { U128 hash; RD_BitmapTopology top; } key_data = {hash, topology}; +#pragma pack(pop) String8 key = str8_struct(&key_data); AC_Artifact artifact = ac_artifact_from_key(access, key, rd_bitmap_artifact_create, rd_bitmap_artifact_destroy, 0); R_Handle texture_candidate = {0}; diff --git a/src/text/text.c b/src/text/text.c index d86062fc..c81d1e08 100644 --- a/src/text/text.c +++ b/src/text/text.c @@ -2250,11 +2250,13 @@ txt_artifact_destroy(AC_Artifact artifact) internal TXT_TextInfo txt_text_info_from_hash_lang(Access *access, U128 hash, TXT_LangKind lang) { +#pragma pack(push, 1) struct { U128 hash; TXT_LangKind lang; } key = {hash, lang}; +#pragma pack(pop) String8 key_string = str8_struct(&key); AC_Artifact artifact = ac_artifact_from_key(access, key_string, txt_artifact_create, 
txt_artifact_destroy, 0, .flags = AC_Flag_Wide); TXT_Artifact *txt_artifact = (TXT_Artifact *)artifact.u64[0]; diff --git a/src/third_party/martins_hash/md5.h b/src/third_party/martins_hash/md5.h new file mode 100644 index 00000000..54632d0e --- /dev/null +++ b/src/third_party/martins_hash/md5.h @@ -0,0 +1,435 @@ +#pragma once + +// https://www.rfc-editor.org/rfc/rfc1321.html + +#include <stddef.h> +#include <stdint.h> + +// +// interface +// + +#define MD5_DIGEST_SIZE 16 +#define MD5_BLOCK_SIZE 64 + +typedef struct { + uint8_t buffer[MD5_BLOCK_SIZE]; + uint64_t count; + uint32_t state[4]; +} md5_ctx; + +static inline void md5_init(md5_ctx* ctx); +static inline void md5_update(md5_ctx* ctx, const void* data, size_t size); +static inline void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]); + +// +// implementation +// + +#include <string.h> // memcpy, memset + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wunsafe-buffer-usage" +# pragma clang diagnostic ignored "-Wlanguage-extension-token" +# pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#endif + +#if defined(__clang__) +# define MD5_ROL32(x,n) __builtin_rotateleft32(x, n) +#elif defined(_MSC_VER) +# include <stdlib.h> // _rotl +# define MD5_ROL32(x,n) _rotl(x, n) +#else +# define MD5_ROL32(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) ) +#endif + +#if defined(_MSC_VER) +# define MD5_GET32LE(ptr) *((const _UNALIGNED uint32_t*)(ptr)) +# define MD5_SET32LE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = (x) +# define MD5_SET64LE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = (x) +#else +# define MD5_GET32LE(ptr) \ + ( \ + ((uint32_t)(ptr)[0] << 0) | \ + ((uint32_t)(ptr)[1] << 8) | \ + ((uint32_t)(ptr)[2] << 16) | \ + ((uint32_t)(ptr)[3] << 24) \ + ) +# define MD5_SET32LE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 0); \ + (ptr)[1] = (uint8_t)((x) >> 8); \ + (ptr)[2] = (uint8_t)((x) >> 16); \ + (ptr)[3] = (uint8_t)((x) >> 24); \ + } \ + while (0) +# define MD5_SET64LE(ptr, x) do \ + { \ + (ptr)[0] = 
(uint8_t)((x) >> 0); \ + (ptr)[1] = (uint8_t)((x) >> 8); \ + (ptr)[2] = (uint8_t)((x) >> 16); \ + (ptr)[3] = (uint8_t)((x) >> 24); \ + (ptr)[4] = (uint8_t)((x) >> 32); \ + (ptr)[5] = (uint8_t)((x) >> 40); \ + (ptr)[6] = (uint8_t)((x) >> 48); \ + (ptr)[7] = (uint8_t)((x) >> 56); \ + } \ + while (0) +#endif + +// MD5_COMPILER_BARRIER forces clang to do better codegen without spilling registers to stack too much +#if defined(__clang__) || defined(__GNUC__) +# define MD5_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") +#else +# define MD5_COMPILER_BARRIER() +#endif + + +#if defined(__x86_64__) || defined(_M_AMD64) + +#if defined(__clang__) || defined(__GNUC__) +# include +# define MD5_TARGET(str) __attribute__((target(str))) +# define MD5_CPUID_EX(x, y, info) __cpuid_count(x, y, info[0], info[1], info[2], info[3]) +# define MD5_ANDN_U32(x,y) (~(x) & (y)) +#else +# include +# define MD5_TARGET(str) +# define MD5_CPUID_EX(x, y, info) __cpuidex(info, x, y) +# define MD5_ANDN_U32(x,y) _andn_u32(x,y) +#endif + +#if defined(__clang__) +# define MD5_RORX_U32(x,n) __builtin_rotateright32(x, n) +#elif defined(_MSC_VER) +# define MD5_RORX_U32(x,n) _rorx_u32(x,n) +#else +# define MD5_RORX_U32(x,n) ( ((x) >> (n)) | ((x) << (32-(n))) ) +#endif + +#define MD5_CPUID_INIT (1 << 0) +#define MD5_CPUID_BMI2 (1 << 1) + +static inline int md5_cpuid(void) +{ + static int cpuid; + + int result = cpuid; + if (result == 0) + { + int info[4]; + + MD5_CPUID_EX(7, 0, info); + int has_bmi = info[1] & (1 << 3); + int has_bmi2 = info[1] & (1 << 8); + + result |= MD5_CPUID_INIT; + if (has_bmi && has_bmi2) + { + result |= MD5_CPUID_BMI2; + } + + cpuid = result; + } + +#if defined(MD5_CPUID_MASK) + result &= MD5_CPUID_MASK; +#endif + + return result; +} + +MD5_TARGET("bmi,bmi2,tune=znver1") +static void md5_process_bmi2(uint32_t* state, const uint8_t* block, size_t count) +{ + // "tune=znver1" allows clang to use LEA with [reg+reg+imm] operand which helps performance on modern CPU's + // -1 
in I will get folded together with constant k + + #define F(x,y,z) (x & y) + MD5_ANDN_U32(x, z) + #define G(x,y,z) (x & z) + MD5_ANDN_U32(z, y) + #define H(x,y,z) (x ^ y ^ z) + #define I(x,y,z) 0 - 1 - (y ^ MD5_ANDN_U32(x, z)) + + #define X(i) MD5_GET32LE(block + i*sizeof(uint32_t)) + + #define ROUND(F, a, b, c, d, x, k, r) do { \ + a += (k) + F(b, c, d) + (x); \ + a = MD5_RORX_U32(a, 32-r) + b; \ + } while (0) + + #define QROUND_F(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(F, a, b, c, d, X(x0), k0, 7); \ + ROUND(F, d, a, b, c, X(x1), k1, 12); \ + ROUND(F, c, d, a, b, X(x2), k2, 17); \ + ROUND(F, b, c, d, a, X(x3), k3, 22); \ + } while (0) + + #define QROUND_G(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(G, a, b, c, d, X(x0), k0, 5); \ + ROUND(G, d, a, b, c, X(x1), k1, 9); \ + ROUND(G, c, d, a, b, X(x2), k2, 14); \ + ROUND(G, b, c, d, a, X(x3), k3, 20); \ + } while (0) + + #define QROUND_H(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(H, a, b, c, d, X(x0), k0, 4); \ + ROUND(H, d, a, b, c, X(x1), k1, 11); \ + ROUND(H, c, d, a, b, X(x2), k2, 16); \ + ROUND(H, b, c, d, a, X(x3), k3, 23); \ + } while (0) + + #define QROUND_I(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(I, a, b, c, d, X(x0), k0, 6); \ + ROUND(I, d, a, b, c, X(x1), k1, 10); \ + ROUND(I, c, d, a, b, X(x2), k2, 15); \ + ROUND(I, b, c, d, a, X(x3), k3, 21); \ + } while (0) + + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + + do + { + uint32_t last_a = a; + uint32_t last_b = b; + uint32_t last_c = c; + uint32_t last_d = d; + + QROUND_F( 0, 1, 2, 3, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee); + QROUND_F( 4, 5, 6, 7, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501); + QROUND_F( 8, 9, 10, 11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be); + QROUND_F(12, 13, 14, 15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821); + MD5_COMPILER_BARRIER(); + + QROUND_G( 1, 6, 11, 0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa); + QROUND_G( 5, 10, 15, 4, 
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8); + QROUND_G( 9, 14, 3, 8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed); + QROUND_G(13, 2, 7, 12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a); + MD5_COMPILER_BARRIER(); + + QROUND_H( 5, 8, 11, 14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c); + QROUND_H( 1, 4, 7, 10, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70); + QROUND_H(13, 0, 3, 6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05); + QROUND_H( 9, 12, 15, 2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665); + MD5_COMPILER_BARRIER(); + + QROUND_I( 0, 7, 14, 5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039); + QROUND_I(12, 3, 10, 1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1); + QROUND_I( 8, 15, 6, 13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1); + QROUND_I( 4, 11, 2, 9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391); + MD5_COMPILER_BARRIER(); + + a += last_a; + b += last_b; + c += last_c; + d += last_d; + + block += MD5_BLOCK_SIZE; + } + while (--count); + + state[0] = a; + state[1] = b; + state[2] = c; + state[3] = d; + + #undef QROUND_F + #undef QROUND_G + #undef QROUND_H + #undef QROUND_I + #undef ROUND + #undef X + #undef F + #undef G + #undef H + #undef I +} + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +static void md5_process(uint32_t* state, const uint8_t* block, size_t count) +{ +#if defined(__x86_64__) || defined(_M_AMD64) + int cpuid = md5_cpuid(); + if (cpuid & MD5_CPUID_BMI2) + { + md5_process_bmi2(state, block, count); + return; + } +#endif + + // F function uses 3 operations instead of 4 when "bit select" instruction is not available + // (x & y) | (~x & z) == (z ^ (x & (y ^ z)) + + // G function uses + instead of | for better ILP + + // #define F(x,y,z) ((x & y) | (~x & z)) + #define F(x,y,z) (z ^ (x & (y ^ z))) + #define G(x,y,z) (x & z) + (y & ~z) + #define H(x,y,z) (x ^ y ^ z) + #define I(x,y,z) (y ^ (x | ~z)) + + #define X(i) MD5_GET32LE(block + i*sizeof(uint32_t)) + + #define ROUND(F, a, b, c, d, x, k, r) do { \ + a 
+= F(b, c, d) + (x) + (k); \ + a = MD5_ROL32(a, r) + b; \ + } while (0) + + #define QROUND_F(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(F, a, b, c, d, X(x0), k0, 7); \ + ROUND(F, d, a, b, c, X(x1), k1, 12); \ + ROUND(F, c, d, a, b, X(x2), k2, 17); \ + ROUND(F, b, c, d, a, X(x3), k3, 22); \ + } while (0) + + #define QROUND_G(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(G, a, b, c, d, X(x0), k0, 5); \ + ROUND(G, d, a, b, c, X(x1), k1, 9); \ + ROUND(G, c, d, a, b, X(x2), k2, 14); \ + ROUND(G, b, c, d, a, X(x3), k3, 20); \ + } while (0) + + #define QROUND_H(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(H, a, b, c, d, X(x0), k0, 4); \ + ROUND(H, d, a, b, c, X(x1), k1, 11); \ + ROUND(H, c, d, a, b, X(x2), k2, 16); \ + ROUND(H, b, c, d, a, X(x3), k3, 23); \ + } while (0) + + #define QROUND_I(x0, x1, x2, x3, k0, k1, k2, k3) do { \ + ROUND(I, a, b, c, d, X(x0), k0, 6); \ + ROUND(I, d, a, b, c, X(x1), k1, 10); \ + ROUND(I, c, d, a, b, X(x2), k2, 15); \ + ROUND(I, b, c, d, a, X(x3), k3, 21); \ + } while (0) + + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + + do + { + uint32_t last_a = a; + uint32_t last_b = b; + uint32_t last_c = c; + uint32_t last_d = d; + + QROUND_F( 0, 1, 2, 3, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee); + QROUND_F( 4, 5, 6, 7, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501); + QROUND_F( 8, 9, 10, 11, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be); + QROUND_F(12, 13, 14, 15, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821); + MD5_COMPILER_BARRIER(); + + QROUND_G( 1, 6, 11, 0, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa); + QROUND_G( 5, 10, 15, 4, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8); + QROUND_G( 9, 14, 3, 8, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed); + QROUND_G(13, 2, 7, 12, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a); + MD5_COMPILER_BARRIER(); + + QROUND_H( 5, 8, 11, 14, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c); + QROUND_H( 1, 4, 7, 10, 0xa4beea44, 0x4bdecfa9, 
0xf6bb4b60, 0xbebfbc70); + QROUND_H(13, 0, 3, 6, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05); + QROUND_H( 9, 12, 15, 2, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665); + MD5_COMPILER_BARRIER(); + + QROUND_I( 0, 7, 14, 5, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039); + QROUND_I(12, 3, 10, 1, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1); + QROUND_I( 8, 15, 6, 13, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1); + QROUND_I( 4, 11, 2, 9, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391); + MD5_COMPILER_BARRIER(); + + a += last_a; + b += last_b; + c += last_c; + d += last_d; + + block += MD5_BLOCK_SIZE; + } + while (--count); + + state[0] = a; + state[1] = b; + state[2] = c; + state[3] = d; + + #undef QROUND_F + #undef QROUND_G + #undef QROUND_H + #undef QROUND_I + #undef ROUND + #undef X + #undef F + #undef G + #undef H + #undef I +} + +void md5_init(md5_ctx* ctx) +{ + ctx->count = 0; + ctx->state[0] = 0x67452301; + ctx->state[1] = 0xefcdab89; + ctx->state[2] = 0x98badcfe; + ctx->state[3] = 0x10325476; +} + +void md5_update(md5_ctx* ctx, const void* data, size_t size) +{ + const uint8_t* buffer = (const uint8_t*)data; + + size_t pending = ctx->count % MD5_BLOCK_SIZE; + ctx->count += size; + + size_t available = MD5_BLOCK_SIZE - pending; + if (pending && size >= available) + { + memcpy(ctx->buffer + pending, buffer, available); + md5_process(ctx->state, ctx->buffer, 1); + buffer += available; + size -= available; + pending = 0; + } + + size_t count = size / MD5_BLOCK_SIZE; + if (count) + { + md5_process(ctx->state, buffer, count); + buffer += count * MD5_BLOCK_SIZE; + size -= count * MD5_BLOCK_SIZE; + } + + memcpy(ctx->buffer + pending, buffer, size); +} + +void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]) +{ + uint64_t count = ctx->count; + uint64_t bitcount = count * 8; + + size_t pending = count % MD5_BLOCK_SIZE; + size_t blocks = pending < MD5_BLOCK_SIZE - sizeof(bitcount) ? 
1 : 2; + + ctx->buffer[pending++] = 0x80; + + uint8_t padding[2 * MD5_BLOCK_SIZE]; + memcpy(padding, ctx->buffer, MD5_BLOCK_SIZE); + memset(padding + pending, 0, MD5_BLOCK_SIZE); + MD5_SET64LE(padding + blocks * MD5_BLOCK_SIZE - sizeof(bitcount), bitcount); + + md5_process(ctx->state, padding, blocks); + + for (size_t i=0; i<4; i++) + { + MD5_SET32LE(digest + i*sizeof(uint32_t), ctx->state[i]); + } +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif diff --git a/src/third_party/martins_hash/sha1.h b/src/third_party/martins_hash/sha1.h new file mode 100644 index 00000000..043ac986 --- /dev/null +++ b/src/third_party/martins_hash/sha1.h @@ -0,0 +1,441 @@ +#pragma once + +// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf + +#include +#include + +// +// interface +// + +#define SHA1_DIGEST_SIZE 20 +#define SHA1_BLOCK_SIZE 64 + +typedef struct { + uint8_t buffer[SHA1_BLOCK_SIZE]; + uint64_t count; + uint32_t state[5]; +} sha1_ctx; + +static inline void sha1_init(sha1_ctx* ctx); +static inline void sha1_update(sha1_ctx* ctx, const void* data, size_t size); +static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]); + +// +// implementation +// + +#include // memcpy, memset + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wunsafe-buffer-usage" +# pragma clang diagnostic ignored "-Wlanguage-extension-token" +# pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#elif defined(_MSC_VER) +# pragma warning (push) +# pragma warning (disable : 4127) +#endif + +#if defined(__clang__) +# define SHA1_ROL32(x,n) __builtin_rotateleft32(x, n) +#elif defined(_MSC_VER) +# include +# define SHA1_ROL32(x,n) _rotl(x, n) +#else +# define SHA1_ROL32(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) ) +#endif + +#if defined(_MSC_VER) +# include +# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) ) +# define 
SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x) +# define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +#else +# define SHA1_GET32BE(ptr) \ + ( \ + ((uint32_t)(ptr)[0] << 24) | \ + ((uint32_t)(ptr)[1] << 16) | \ + ((uint32_t)(ptr)[2] << 8) | \ + ((uint32_t)(ptr)[3] << 0) \ + ) +# define SHA1_SET32BE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 24); \ + (ptr)[1] = (uint8_t)((x) >> 16); \ + (ptr)[2] = (uint8_t)((x) >> 8); \ + (ptr)[3] = (uint8_t)((x) >> 0); \ + } \ + while (0) +# define SHA1_SET64BE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 56); \ + (ptr)[1] = (uint8_t)((x) >> 48); \ + (ptr)[2] = (uint8_t)((x) >> 40); \ + (ptr)[3] = (uint8_t)((x) >> 32); \ + (ptr)[4] = (uint8_t)((x) >> 24); \ + (ptr)[5] = (uint8_t)((x) >> 16); \ + (ptr)[6] = (uint8_t)((x) >> 8); \ + (ptr)[7] = (uint8_t)((x) >> 0); \ + } \ + while (0) +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) + +#include <tmmintrin.h> // SSSE3 +#include <immintrin.h> // SHANI + +#if defined(__clang__) || defined(__GNUC__) +# include <cpuid.h> // __cpuid, __cpuid_count +# define SHA1_TARGET(str) __attribute__((target(str))) +# define SHA1_CPUID(x, info) __cpuid(x, info[0], info[1], info[2], info[3]) +# define SHA1_CPUID_EX(x, y, info) __cpuid_count(x, y, info[0], info[1], info[2], info[3]) +#else +# include <intrin.h> // __cpuid, __cpuidex +# define SHA1_TARGET(str) +# define SHA1_CPUID(x, info) __cpuid(info, x) +# define SHA1_CPUID_EX(x, y, info) __cpuidex(info, x, y) +#endif + +#define SHA1_CPUID_INIT (1 << 0) +#define SHA1_CPUID_SHANI (1 << 1) + +static inline int sha1_cpuid(void) +{ + static int cpuid; + + int result = cpuid; + if (result == 0) + { + int info[4]; + + SHA1_CPUID(1, info); + int has_ssse3 = info[2] & (1 << 9); // info[] = {eax,ebx,ecx,edx}; SSSE3 is CPUID.1:ECX bit 9 (EDX bit 9 is APIC) + + SHA1_CPUID_EX(7, 0, info); + int has_shani = info[1] & (1 << 29); + + result |= SHA1_CPUID_INIT; + if (has_ssse3 && has_shani) + { + result |= SHA1_CPUID_SHANI; + } + + cpuid = result; + } + +#if defined(SHA1_CPUID_MASK) + result &= SHA1_CPUID_MASK; +#endif + + return result; +} + +SHA1_TARGET("ssse3,sha") +static void sha1_process_shani(uint32_t* 
state, const uint8_t* block, size_t count) +{ + const __m128i* buffer = (const __m128i*)block; + + // for performing two operations in one: + // 1) dwords need to be loaded as big-endian + // 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0] + const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + + #define W(i) w[(i)%4] + + // 4 wide round calculations + #define QROUND(i) do { \ + /* first four rounds loads input message */ \ + if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \ + /* update previous message dwords for next rounds */ \ + if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i)); \ + if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \ + if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i)); \ + /* calculate E from message dwords */ \ + if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \ + if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i)); \ + /* round function */ \ + e0 = abcd; \ + abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4); \ + } while(0) + + // load initial state + __m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a] + __m128i e0 = _mm_loadu_si32(&state[4]); // [0,0,0,e] + + // change dword order + abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane + e0 = _mm_slli_si128(e0, 12); // [e,0,0,0] where e is in top lane + + do + { + // remember current state + __m128i last_abcd = abcd; + __m128i last_e0 = e0; + + __m128i tmp, w[4]; + + QROUND(0); + QROUND(1); + QROUND(2); + QROUND(3); + QROUND(4); + QROUND(5); + QROUND(6); + QROUND(7); + QROUND(8); + QROUND(9); + QROUND(10); + QROUND(11); + QROUND(12); + QROUND(13); + QROUND(14); + QROUND(15); + QROUND(16); + QROUND(17); + QROUND(18); + QROUND(19); + + // update next state + abcd = _mm_add_epi32(abcd, last_abcd); + e0 = _mm_sha1nexte_epu32(e0, last_e0); + + buffer += 4; + } + while (--count); + + // restore dword order + abcd = _mm_shuffle_epi32(abcd, 
_MM_SHUFFLE(0,1,2,3)); + e0 = _mm_shuffle_epi32(e0, _MM_SHUFFLE(0,1,2,3)); + + // save the new state + _mm_storeu_si128((__m128i*)state, abcd); + _mm_storeu_si32(&state[4], e0); + + #undef QROUND + #undef W +} + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +static void sha1_process(uint32_t* state, const uint8_t* block, size_t count) +{ +#if defined(__x86_64__) || defined(_M_AMD64) + int cpuid = sha1_cpuid(); + if (cpuid & SHA1_CPUID_SHANI) + { + sha1_process_shani(state, block, count); + return; + } +#endif + + #define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z)) + #define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z)) + #define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y)))) + #define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z)) + + #define W(i) w[(i+16)%16] + + #define ROUND(i,a,b,c,d,e,F) do \ + { \ + uint32_t w0; \ + if (i < 16) W(i) = w0 = SHA1_GET32BE(block + i*sizeof(uint32_t)); \ + if (i >= 16) W(i) = w0 = SHA1_ROL32(W(i-3) ^ W(i-8) ^ W(i-14) ^ W(i-16), 1); \ + \ + e += SHA1_ROL32(a,5) + F(b,c,d) + w0; \ + b = SHA1_ROL32(b,30); \ + } while (0) + + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + + do + { + uint32_t last_a = a; + uint32_t last_b = b; + uint32_t last_c = c; + uint32_t last_d = d; + uint32_t last_e = e; + + uint32_t w[16]; + + ROUND( 0, a, b, c, d, e, F1); + ROUND( 1, e, a, b, c, d, F1); + ROUND( 2, d, e, a, b, c, F1); + ROUND( 3, c, d, e, a, b, F1); + ROUND( 4, b, c, d, e, a, F1); + ROUND( 5, a, b, c, d, e, F1); + ROUND( 6, e, a, b, c, d, F1); + ROUND( 7, d, e, a, b, c, F1); + ROUND( 8, c, d, e, a, b, F1); + ROUND( 9, b, c, d, e, a, F1); + ROUND(10, a, b, c, d, e, F1); + ROUND(11, e, a, b, c, d, F1); + ROUND(12, d, e, a, b, c, F1); + ROUND(13, c, d, e, a, b, F1); + ROUND(14, b, c, d, e, a, F1); + ROUND(15, a, b, c, d, e, F1); + ROUND(16, e, a, b, c, d, F1); + ROUND(17, d, e, a, b, c, F1); + ROUND(18, c, d, e, a, b, F1); + ROUND(19, b, c, d, e, a, F1); + + ROUND(20, a, b, 
c, d, e, F2); + ROUND(21, e, a, b, c, d, F2); + ROUND(22, d, e, a, b, c, F2); + ROUND(23, c, d, e, a, b, F2); + ROUND(24, b, c, d, e, a, F2); + ROUND(25, a, b, c, d, e, F2); + ROUND(26, e, a, b, c, d, F2); + ROUND(27, d, e, a, b, c, F2); + ROUND(28, c, d, e, a, b, F2); + ROUND(29, b, c, d, e, a, F2); + ROUND(30, a, b, c, d, e, F2); + ROUND(31, e, a, b, c, d, F2); + ROUND(32, d, e, a, b, c, F2); + ROUND(33, c, d, e, a, b, F2); + ROUND(34, b, c, d, e, a, F2); + ROUND(35, a, b, c, d, e, F2); + ROUND(36, e, a, b, c, d, F2); + ROUND(37, d, e, a, b, c, F2); + ROUND(38, c, d, e, a, b, F2); + ROUND(39, b, c, d, e, a, F2); + + ROUND(40, a, b, c, d, e, F3); + ROUND(41, e, a, b, c, d, F3); + ROUND(42, d, e, a, b, c, F3); + ROUND(43, c, d, e, a, b, F3); + ROUND(44, b, c, d, e, a, F3); + ROUND(45, a, b, c, d, e, F3); + ROUND(46, e, a, b, c, d, F3); + ROUND(47, d, e, a, b, c, F3); + ROUND(48, c, d, e, a, b, F3); + ROUND(49, b, c, d, e, a, F3); + ROUND(50, a, b, c, d, e, F3); + ROUND(51, e, a, b, c, d, F3); + ROUND(52, d, e, a, b, c, F3); + ROUND(53, c, d, e, a, b, F3); + ROUND(54, b, c, d, e, a, F3); + ROUND(55, a, b, c, d, e, F3); + ROUND(56, e, a, b, c, d, F3); + ROUND(57, d, e, a, b, c, F3); + ROUND(58, c, d, e, a, b, F3); + ROUND(59, b, c, d, e, a, F3); + + ROUND(60, a, b, c, d, e, F4); + ROUND(61, e, a, b, c, d, F4); + ROUND(62, d, e, a, b, c, F4); + ROUND(63, c, d, e, a, b, F4); + ROUND(64, b, c, d, e, a, F4); + ROUND(65, a, b, c, d, e, F4); + ROUND(66, e, a, b, c, d, F4); + ROUND(67, d, e, a, b, c, F4); + ROUND(68, c, d, e, a, b, F4); + ROUND(69, b, c, d, e, a, F4); + ROUND(70, a, b, c, d, e, F4); + ROUND(71, e, a, b, c, d, F4); + ROUND(72, d, e, a, b, c, F4); + ROUND(73, c, d, e, a, b, F4); + ROUND(74, b, c, d, e, a, F4); + ROUND(75, a, b, c, d, e, F4); + ROUND(76, e, a, b, c, d, F4); + ROUND(77, d, e, a, b, c, F4); + ROUND(78, c, d, e, a, b, F4); + ROUND(79, b, c, d, e, a, F4); + + a += last_a; + b += last_b; + c += last_c; + d += last_d; + e += last_e; + + block += 
SHA1_BLOCK_SIZE; + } + while (--count); + + state[0] = a; + state[1] = b; + state[2] = c; + state[3] = d; + state[4] = e; + + #undef ROUND + #undef W + #undef F1 + #undef F2 + #undef F3 + #undef F4 +} + +void sha1_init(sha1_ctx* ctx) +{ + ctx->count = 0; + ctx->state[0] = 0x67452301; + ctx->state[1] = 0xefcdab89; + ctx->state[2] = 0x98badcfe; + ctx->state[3] = 0x10325476; + ctx->state[4] = 0xc3d2e1f0; +} + +void sha1_update(sha1_ctx* ctx, const void* data, size_t size) +{ + const uint8_t* buffer = (const uint8_t*)data; + + size_t pending = ctx->count % SHA1_BLOCK_SIZE; + ctx->count += size; + + size_t available = SHA1_BLOCK_SIZE - pending; + if (pending && size >= available) + { + memcpy(ctx->buffer + pending, buffer, available); + sha1_process(ctx->state, ctx->buffer, 1); + buffer += available; + size -= available; + pending = 0; + } + + size_t count = size / SHA1_BLOCK_SIZE; + if (count) + { + sha1_process(ctx->state, buffer, count); + buffer += count * SHA1_BLOCK_SIZE; + size -= count * SHA1_BLOCK_SIZE; + } + + memcpy(ctx->buffer + pending, buffer, size); +} + +void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]) +{ + uint64_t count = ctx->count; + uint64_t bitcount = count * 8; + + size_t pending = count % SHA1_BLOCK_SIZE; + size_t blocks = pending < SHA1_BLOCK_SIZE - sizeof(bitcount) ? 
1 : 2; + + ctx->buffer[pending++] = 0x80; + + uint8_t padding[2 * SHA1_BLOCK_SIZE]; + memcpy(padding, ctx->buffer, SHA1_BLOCK_SIZE); + memset(padding + pending, 0, SHA1_BLOCK_SIZE); + SHA1_SET64BE(padding + blocks * SHA1_BLOCK_SIZE - sizeof(bitcount), bitcount); + + sha1_process(ctx->state, padding, blocks); + + for (size_t i=0; i<5; i++) + { + SHA1_SET32BE(digest + i*sizeof(uint32_t), ctx->state[i]); + } +} + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(_MSC_VER) +# pragma warning (pop) +#endif diff --git a/src/third_party/martins_hash/sha256.h b/src/third_party/martins_hash/sha256.h new file mode 100644 index 00000000..72a0fad6 --- /dev/null +++ b/src/third_party/martins_hash/sha256.h @@ -0,0 +1,472 @@ +#pragma once + +// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +// https://www.rfc-editor.org/rfc/rfc6234 + +#include +#include + +// +// interface +// + +#define SHA224_DIGEST_SIZE 28 +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + +typedef struct { + uint8_t buffer[SHA256_BLOCK_SIZE]; + uint64_t count; + uint32_t state[8]; +} sha256_ctx; + +typedef sha256_ctx sha224_ctx; + +static inline void sha256_init(sha256_ctx* ctx); +static inline void sha256_update(sha256_ctx* ctx, const void* data, size_t size); +static inline void sha256_finish(sha256_ctx* ctx, uint8_t digest[SHA256_DIGEST_SIZE]); + +static inline void sha224_init(sha224_ctx* ctx); +static inline void sha224_update(sha224_ctx* ctx, const void* data, size_t size); +static inline void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_SIZE]); + +// +// implementation +// + +#include // memcpy, memset + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wunsafe-buffer-usage" +# pragma clang diagnostic ignored "-Wlanguage-extension-token" +# pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#elif defined(_MSC_VER) +# pragma warning (push) 
+# pragma warning (disable : 4127) +#endif + +#if defined(__clang__) +# define SHA256_ROR32(x,n) __builtin_rotateright32(x, n) +#elif defined(_MSC_VER) +# include <stdlib.h> // _rotr +# define SHA256_ROR32(x,n) _rotr(x, n) +#else +# define SHA256_ROR32(x,n) ( ((x) >> (n)) | ((x) << (32-(n))) ) +#endif + +#if defined(_MSC_VER) +# include <stdlib.h> // _byteswap_ulong, _byteswap_uint64 +# define SHA256_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) ) +# define SHA256_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x) +# define SHA256_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +#else +# define SHA256_GET32BE(ptr) \ + ( \ + ((uint32_t)(ptr)[0] << 24) | \ + ((uint32_t)(ptr)[1] << 16) | \ + ((uint32_t)(ptr)[2] << 8) | \ + ((uint32_t)(ptr)[3] << 0) \ + ) +# define SHA256_SET32BE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 24); \ + (ptr)[1] = (uint8_t)((x) >> 16); \ + (ptr)[2] = (uint8_t)((x) >> 8); \ + (ptr)[3] = (uint8_t)((x) >> 0); \ + } \ + while (0) +# define SHA256_SET64BE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 56); \ + (ptr)[1] = (uint8_t)((x) >> 48); \ + (ptr)[2] = (uint8_t)((x) >> 40); \ + (ptr)[3] = (uint8_t)((x) >> 32); \ + (ptr)[4] = (uint8_t)((x) >> 24); \ + (ptr)[5] = (uint8_t)((x) >> 16); \ + (ptr)[6] = (uint8_t)((x) >> 8); \ + (ptr)[7] = (uint8_t)((x) >> 0); \ + } \ + while (0) +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) + +#include <tmmintrin.h> // SSSE3 +#include <immintrin.h> // SHANI + +#if defined(__clang__) || defined(__GNUC__) +# include <cpuid.h> // __cpuid, __cpuid_count +# define SHA256_TARGET(str) __attribute__((target(str))) +# define SHA256_CPUID(x, info) __cpuid(x, info[0], info[1], info[2], info[3]) +# define SHA256_CPUID_EX(x, y, info) __cpuid_count(x, y, info[0], info[1], info[2], info[3]) +#else +# include <intrin.h> // __cpuid, __cpuidex +# define SHA256_TARGET(str) +# define SHA256_CPUID(x, info) __cpuid(info, x) +# define SHA256_CPUID_EX(x, y, info) __cpuidex(info, x, y) +#endif + +#define SHA256_CPUID_INIT (1 << 0) +#define SHA256_CPUID_SHANI (1 << 1) + +static inline int sha256_cpuid(void) +{ + static int cpuid; + + int result = cpuid; + if (result 
== 0) + { + int info[4]; + + SHA256_CPUID(1, info); + int has_ssse3 = info[3] & (1 << 9); + + SHA256_CPUID_EX(7, 0, info); + int has_shani = info[1] & (1 << 29); + + result |= SHA256_CPUID_INIT; + if (has_ssse3 && has_shani) + { + result |= SHA256_CPUID_SHANI; + } + + cpuid = result; + } + +#if defined(SHA256_CPUID_MASK) + result &= SHA256_CPUID_MASK; +#endif + + return result; +} + +SHA256_TARGET("ssse3,sha") +static void sha256_process_shani(uint32_t* state, const uint8_t* block, size_t count) +{ + const __m128i* buffer = (const __m128i*)block; + + // to byteswap when doing big-ending load for message dwords + const __m128i bswap = _mm_setr_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12); + + static const uint32_t K[16][4] = + { + { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 }, + { 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 }, + { 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 }, + { 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 }, + { 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc }, + { 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da }, + { 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 }, + { 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 }, + { 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 }, + { 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 }, + { 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 }, + { 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 }, + { 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 }, + { 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 }, + { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 }, + { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }, + }; + + #define W(i) w[(i)%4] + + // 4 wide round calculations + #define QROUND(i) do { \ + /* first four rounds loads input message */ \ + if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \ + /* add round constant */ \ + tmp = _mm_add_epi32(W(i), _mm_loadu_si128((const __m128i*)K[i])); \ + /* update previous message dwords for next rounds */ \ + if (i > 2 && i < 15) 
W(i-3) = _mm_sha256msg2_epu32(_mm_add_epi32(W(i-3), _mm_alignr_epi8(W(i), W(i-1), 4)), W(i)); \ + if (i > 0 && i < 13) W(i-1) = _mm_sha256msg1_epu32(W(i-1), W(i)); \ + /* round functions */ \ + state1 = _mm_sha256rnds2_epu32(state1, state0, tmp); \ + state0 = _mm_sha256rnds2_epu32(state0, state1, _mm_shuffle_epi32(tmp, _MM_SHUFFLE(0,0,3,2))); \ + } while(0) + + // load initial state + __m128i abcd = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] + __m128i efgh = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h] + + // dword order for sha256rnds2 instruction + __m128i state0 = _mm_unpackhi_epi64(efgh, abcd); // [a,b,e,f] + __m128i state1 = _mm_unpacklo_epi64(efgh, abcd); // [c,d,g,h] + + do + { + // remember current state + __m128i last0 = state0; + __m128i last1 = state1; + + __m128i tmp, w[4]; + + QROUND(0); + QROUND(1); + QROUND(2); + QROUND(3); + QROUND(4); + QROUND(5); + QROUND(6); + QROUND(7); + QROUND(8); + QROUND(9); + QROUND(10); + QROUND(11); + QROUND(12); + QROUND(13); + QROUND(14); + QROUND(15); + + // update next state + state0 = _mm_add_epi32(state0, last0); + state1 = _mm_add_epi32(state1, last1); + + buffer += 4; + } + while (--count); + + // restore dword order + abcd = _mm_unpackhi_epi64(state1, state0); + efgh = _mm_unpacklo_epi64(state1, state0); + + // save the new state + _mm_storeu_si128((__m128i*)&state[0], _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3))); + _mm_storeu_si128((__m128i*)&state[4], _mm_shuffle_epi32(efgh, _MM_SHUFFLE(0,1,2,3))); + + #undef QROUND + #undef W +} + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +static void sha256_process(uint32_t* state, const uint8_t* block, size_t count) +{ +#if defined(__x86_64__) || defined(_M_AMD64) + int cpuid = sha256_cpuid(); + if (cpuid & SHA256_CPUID_SHANI) + { + sha256_process_shani(state, block, count); + return; + } +#endif + + #define Ch(x,y,z) ((x & (y ^ z)) ^ z) + #define 
Maj(x,y,z) ((x & y) | (z & (x | y))) + + #define BSig0(x) (SHA256_ROR32(x, 2) ^ SHA256_ROR32(x, 13) ^ SHA256_ROR32(x, 22)) + #define BSig1(x) (SHA256_ROR32(x, 6) ^ SHA256_ROR32(x, 11) ^ SHA256_ROR32(x, 25)) + #define SSig0(x) (SHA256_ROR32(x, 7) ^ SHA256_ROR32(x, 18) ^ (x >> 3)) + #define SSig1(x) (SHA256_ROR32(x, 17) ^ SHA256_ROR32(x, 19) ^ (x >> 10)) + + #define W(i) w[(i+16)%16] + + #define ROUND(i,a,b,c,d,e,f,g,h,K) do \ + { \ + uint32_t w0; \ + if (i < 16) W(i) = w0 = SHA256_GET32BE(block + i*sizeof(uint32_t)); \ + if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \ + \ + uint32_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \ + uint32_t t2 = BSig0(a) + Maj(a,b,c); \ + d += t1; \ + h = t1 + t2; \ + } while (0) + + do + { + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + uint32_t w[16]; + + ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98); + ROUND( 1, h, a, b, c, d, e, f, g, 0x71374491); + ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcf); + ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba5); + ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25b); + ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1); + ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4); + ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5); + ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98); + ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b01); + ROUND(10, g, h, a, b, c, d, e, f, 0x243185be); + ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3); + ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74); + ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe); + ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a7); + ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174); + ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c1); + ROUND(17, h, a, b, c, d, e, f, g, 0xefbe4786); + ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc6); + ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc); + ROUND(20, e, f, g, h, a, b, c, d, 
0x2de92c6f); + ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa); + ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc); + ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da); + ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152); + ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d); + ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c8); + ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7); + ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf3); + ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147); + ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351); + ROUND(31, b, c, d, e, f, g, h, a, 0x14292967); + ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a85); + ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b2138); + ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc); + ROUND(35, f, g, h, a, b, c, d, e, 0x53380d13); + ROUND(36, e, f, g, h, a, b, c, d, 0x650a7354); + ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb); + ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e); + ROUND(39, b, c, d, e, f, g, h, a, 0x92722c85); + ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1); + ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664b); + ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70); + ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a3); + ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819); + ROUND(45, d, e, f, g, h, a, b, c, 0xd6990624); + ROUND(46, c, d, e, f, g, h, a, b, 0xf40e3585); + ROUND(47, b, c, d, e, f, g, h, a, 0x106aa070); + ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116); + ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c08); + ROUND(50, g, h, a, b, c, d, e, f, 0x2748774c); + ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5); + ROUND(52, e, f, g, h, a, b, c, d, 0x391c0cb3); + ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a); + ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f); + ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3); + ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee); + ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f); + ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814); + ROUND(59, f, g, h, a, b, c, d, e, 0x8cc70208); + ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa); + ROUND(61, d, e, f, g, h, 
a, b, c, 0xa4506ceb); + ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7); + ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2); + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + + block += SHA256_BLOCK_SIZE; + } + while (--count); + + #undef ROUND + #undef W + #undef Ch + #undef Maj + #undef BSig0 + #undef BSig1 + #undef SSig0 + #undef SSig1 +} + +void sha256_init(sha256_ctx* ctx) +{ + ctx->count = 0; + ctx->state[0] = 0x6a09e667; + ctx->state[1] = 0xbb67ae85; + ctx->state[2] = 0x3c6ef372; + ctx->state[3] = 0xa54ff53a; + ctx->state[4] = 0x510e527f; + ctx->state[5] = 0x9b05688c; + ctx->state[6] = 0x1f83d9ab; + ctx->state[7] = 0x5be0cd19; +} + +void sha256_update(sha256_ctx* ctx, const void* data, size_t size) +{ + const uint8_t* buffer = (const uint8_t*)data; + + size_t pending = ctx->count % SHA256_BLOCK_SIZE; + ctx->count += size; + + size_t available = SHA256_BLOCK_SIZE - pending; + if (pending && size >= available) + { + memcpy(ctx->buffer + pending, buffer, available); + sha256_process(ctx->state, ctx->buffer, 1); + buffer += available; + size -= available; + pending = 0; + } + + size_t count = size / SHA256_BLOCK_SIZE; + if (count) + { + sha256_process(ctx->state, buffer, count); + buffer += count * SHA256_BLOCK_SIZE; + size -= count * SHA256_BLOCK_SIZE; + } + + memcpy(ctx->buffer + pending, buffer, size); +} + +void sha256_finish(sha256_ctx* ctx, uint8_t digest[SHA256_DIGEST_SIZE]) +{ + uint64_t count = ctx->count; + uint64_t bitcount = count * 8; + + size_t pending = count % SHA256_BLOCK_SIZE; + size_t blocks = pending < SHA256_BLOCK_SIZE - sizeof(bitcount) ? 
1 : 2; + + ctx->buffer[pending++] = 0x80; + + uint8_t padding[2 * SHA256_BLOCK_SIZE]; + memcpy(padding, ctx->buffer, SHA256_BLOCK_SIZE); + memset(padding + pending, 0, SHA256_BLOCK_SIZE); + SHA256_SET64BE(padding + blocks * SHA256_BLOCK_SIZE - sizeof(bitcount), bitcount); + + sha256_process(ctx->state, padding, blocks); + + for (size_t i=0; i<8; i++) + { + SHA256_SET32BE(digest + i*sizeof(uint32_t), ctx->state[i]); + } +} + +void sha224_init(sha224_ctx* ctx) +{ + ctx->count = 0; + ctx->state[0] = 0xc1059ed8; + ctx->state[1] = 0x367cd507; + ctx->state[2] = 0x3070dd17; + ctx->state[3] = 0xf70e5939; + ctx->state[4] = 0xffc00b31; + ctx->state[5] = 0x68581511; + ctx->state[6] = 0x64f98fa7; + ctx->state[7] = 0xbefa4fa4; +} + +void sha224_update(sha224_ctx* ctx, const void* data, size_t size) +{ + sha256_update(ctx, data, size); +} + +void sha224_finish(sha224_ctx* ctx, uint8_t digest[SHA224_DIGEST_SIZE]) +{ + uint8_t temp[SHA256_DIGEST_SIZE]; + sha256_finish(ctx, temp); + + memcpy(digest, temp, SHA224_DIGEST_SIZE); +} + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(_MSC_VER) +# pragma warning (pop) +#endif diff --git a/src/third_party/martins_hash/sha512.h b/src/third_party/martins_hash/sha512.h new file mode 100644 index 00000000..2a7dad07 --- /dev/null +++ b/src/third_party/martins_hash/sha512.h @@ -0,0 +1,508 @@ +#pragma once + +// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +// https://www.rfc-editor.org/rfc/rfc6234 + +#include +#include + +// +// interface +// + +#define SHA384_DIGEST_SIZE 48 +#define SHA512_DIGEST_SIZE 64 +#define SHA512_BLOCK_SIZE 128 + +typedef struct { + uint8_t buffer[SHA512_BLOCK_SIZE]; + uint64_t count[2]; + uint64_t state[8]; +} sha512_ctx; + +typedef sha512_ctx sha384_ctx; + +static inline void sha512_init(sha512_ctx* ctx); +static inline void sha512_update(sha512_ctx* ctx, const void* data, size_t size); +static inline void sha512_finish(sha512_ctx* ctx, uint8_t digest[SHA512_DIGEST_SIZE]); + +static 
inline void sha384_init(sha384_ctx* ctx); +static inline void sha384_update(sha384_ctx* ctx, const void* data, size_t size); +static inline void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_SIZE]); + +// +// implementation +// + +#include // memcpy, memset + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-align" +# pragma clang diagnostic ignored "-Wunsafe-buffer-usage" +# pragma clang diagnostic ignored "-Wlanguage-extension-token" +# pragma clang diagnostic ignored "-Wdeclaration-after-statement" +#elif defined(_MSC_VER) +# pragma warning (push) +# pragma warning (disable : 4127) +#endif + +#if defined(__clang__) +# define SHA512_ROR64(x,n) __builtin_rotateright64(x, n) +#elif defined(_MSC_VER) +# include +# define SHA512_ROR64(x,n) _rotr64(x, n) +#else +# define SHA512_ROR64(x,n) ( ((x) >> (n)) | ((x) << (64-(n))) ) +#endif + +#if defined(_MSC_VER) +# include +# define SHA512_GET64BE(ptr) _byteswap_uint64( *((const _UNALIGNED uint64_t*)(ptr)) ) +# define SHA512_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x) +#else +# define SHA512_GET64BE(ptr) \ + ( \ + ((uint64_t)((ptr)[0]) << 56) | \ + ((uint64_t)((ptr)[1]) << 48) | \ + ((uint64_t)((ptr)[2]) << 40) | \ + ((uint64_t)((ptr)[3]) << 32) | \ + ((uint64_t)((ptr)[4]) << 24) | \ + ((uint64_t)((ptr)[5]) << 16) | \ + ((uint64_t)((ptr)[6]) << 8) | \ + ((uint64_t)((ptr)[7]) << 0) \ + ) +# define SHA512_SET64BE(ptr, x) do \ + { \ + (ptr)[0] = (uint8_t)((x) >> 56); \ + (ptr)[1] = (uint8_t)((x) >> 48); \ + (ptr)[2] = (uint8_t)((x) >> 40); \ + (ptr)[3] = (uint8_t)((x) >> 32); \ + (ptr)[4] = (uint8_t)((x) >> 24); \ + (ptr)[5] = (uint8_t)((x) >> 16); \ + (ptr)[6] = (uint8_t)((x) >> 8); \ + (ptr)[7] = (uint8_t)((x) >> 0); \ + } \ + while (0) +#endif + +#if defined(__x86_64__) || defined(_M_AMD64) + +#include + +#if defined(__clang__) || defined(__GNUC__) +# include +# define SHA512_TARGET(str) __attribute__((target(str))) +# define 
SHA512_CPUID(x, info) __cpuid(x, info[0], info[1], info[2], info[3]) +# define SHA512_CPUID_EX(x, y, info) __cpuid_count(x, y, info[0], info[1], info[2], info[3]) +# define SHA512_XGETBV(x) __builtin_ia32_xgetbv(x) +#else +# include +# define SHA512_TARGET(str) +# define SHA512_CPUID(x, info) __cpuid(info, x) +# define SHA512_CPUID_EX(x, y, info) __cpuidex(info, x, y) +# define SHA512_XGETBV(x) _xgetbv(x) +#endif + +#define SHA512_CPUID_INIT (1 << 0) +#define SHA512_CPUID_VSHA512 (1 << 1) + +SHA512_TARGET("xsave") +static inline int sha512_cpuid(void) +{ + static int cpuid; + + int result = cpuid; + if (result == 0) + { + int info[4]; + + SHA256_CPUID(1, info); + int has_xsave = info[2] & (1 << 26); + + int has_ymm = 0; + if (has_xsave) + { + uint64_t xcr0 = SHA512_XGETBV(0); + has_ymm = xcr0 & (1 << 2); + } + + SHA256_CPUID_EX(7, 0, info); + int has_avx2 = info[1] & (1 << 5); + + SHA256_CPUID_EX(7, 1, info); + int has_sha512 = info[0] & (1 << 0); + + result |= SHA256_CPUID_INIT; + if (has_ymm && has_avx2 && has_sha512) + { + result |= SHA512_CPUID_VSHA512; + } + + cpuid = result; + } + +#if defined(SHA512_CPUID_MASK) + result &= SHA512_CPUID_MASK; +#endif + + return result; +} + +SHA512_TARGET("avx2,sha512") +static void sha512_process_vsha512(uint64_t* state, const uint8_t* block, size_t count) +{ + const __m256i* buffer = (const __m256i*)block; + + // to byteswap when doing big-ending load for message qwords + const __m256i bswap = _mm256_broadcastsi128_si256(_mm_setr_epi8(7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8)); + + static const uint64_t K[20][4] = + { + { 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc }, + { 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 }, + { 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 }, + { 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 }, + { 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 
0x240ca1cc77ac9c65 }, + { 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 }, + { 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 }, + { 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 }, + { 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df }, + { 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b }, + { 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 }, + { 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 }, + { 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 }, + { 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 }, + { 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec }, + { 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b }, + { 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 }, + { 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b }, + { 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c }, + { 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 }, + }; + + #define W(i) w[(i)%4] + + // 4 wide round calculations + #define QROUND(i) do { \ + /* first four rounds loads input message */ \ + if (i < 4) W(i) = _mm256_shuffle_epi8(_mm256_loadu_si256(&buffer[i]), bswap); \ + /* add round constant */ \ + tmp = _mm256_add_epi64(W(i), _mm256_loadu_si256((const __m256i*)K[i])); \ + /* update previous message qwords for next rounds */ \ + if (i > 2 && i < 19) W(i-3) = _mm256_sha512msg2_epi64(_mm256_add_epi64(W(i-3), _mm256_permute4x64_epi64(_mm256_blend_epi32(W(i-1), W(i), 3), _MM_SHUFFLE(0,3,2,1))), W(i)); \ + if (i > 0 && i < 17) W(i-1) = _mm256_sha512msg1_epi64(W(i-1), _mm256_castsi256_si128(W(i))); \ + /* round functions */ \ + 
state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_castsi256_si128(tmp)); \ + state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(tmp, 1)); \ + } while(0) + + // load initial state + __m256i abcd = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[0]), _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] + __m256i efgh = _mm256_permute4x64_epi64(_mm256_loadu_si256((const __m256i*)&state[4]), _MM_SHUFFLE(0,1,2,3)); // [e,f,g,h] + + // qword order for vsha512rnds2 instruction + __m256i state0 = _mm256_permute2x128_si256(efgh, abcd, (3 << 4) | 1); // [a,b,e,f] + __m256i state1 = _mm256_permute2x128_si256(efgh, abcd, (2 << 4) | 0); // [c,d,g,h] + + do + { + // remember current state + __m256i last0 = state0; + __m256i last1 = state1; + + __m256i tmp, w[4]; + + QROUND(0); + QROUND(1); + QROUND(2); + QROUND(3); + QROUND(4); + QROUND(5); + QROUND(6); + QROUND(7); + QROUND(8); + QROUND(9); + QROUND(10); + QROUND(11); + QROUND(12); + QROUND(13); + QROUND(14); + QROUND(15); + QROUND(16); + QROUND(17); + QROUND(18); + QROUND(19); + + // update next state + state0 = _mm256_add_epi64(state0, last0); + state1 = _mm256_add_epi64(state1, last1); + + buffer += 4; + } + while (--count); + + // restore qword order + abcd = _mm256_permute2x128_si256(state1, state0, (3 << 4) | 1); + efgh = _mm256_permute2x128_si256(state1, state0, (2 << 4) | 0); + + // save the new state + _mm256_storeu_si256((__m256i*)&state[0], _mm256_permute4x64_epi64(abcd, _MM_SHUFFLE(0,1,2,3))); + _mm256_storeu_si256((__m256i*)&state[4], _mm256_permute4x64_epi64(efgh, _MM_SHUFFLE(0,1,2,3))); + + #undef QROUND + #undef W +} + +#endif // defined(__x86_64__) || defined(_M_AMD64) + +static void sha512_process(uint64_t* state, const uint8_t* block, size_t count) +{ +#if defined(__x86_64__) || defined(_M_AMD64) + int cpuid = sha512_cpuid(); + if (cpuid & SHA512_CPUID_VSHA512) + { + sha512_process_vsha512(state, block, count); + return; + } +#endif + + #define Ch(x,y,z) ((x & (y ^ z)) ^ z) + 
#define Maj(x,y,z) ((x & y) | (z & (x | y))) + + #define BSig0(x) (SHA512_ROR64(x, 28) ^ SHA512_ROR64(x, 34) ^ SHA512_ROR64(x, 39)) + #define BSig1(x) (SHA512_ROR64(x, 14) ^ SHA512_ROR64(x, 18) ^ SHA512_ROR64(x, 41)) + #define SSig0(x) (SHA512_ROR64(x, 1) ^ SHA512_ROR64(x, 8) ^ (x >> 7)) + #define SSig1(x) (SHA512_ROR64(x, 19) ^ SHA512_ROR64(x, 61) ^ (x >> 6)) + + #define W(i) w[(i+16)%16] + + #define ROUND(i,a,b,c,d,e,f,g,h,K) do \ + { \ + uint64_t w0; \ + if (i < 16) W(i) = w0 = SHA512_GET64BE(block + i*sizeof(uint64_t)); \ + if (i >= 16) W(i) = w0 = SSig1(W(i-2)) + W(i-7) + SSig0(W(i-15)) + W(i-16); \ + \ + uint64_t t1 = h + BSig1(e) + Ch(e,f,g) + K + w0; \ + uint64_t t2 = BSig0(a) + Maj(a,b,c); \ + d += t1; \ + h = t1 + t2; \ + } while (0) + + do + { + uint64_t a = state[0]; + uint64_t b = state[1]; + uint64_t c = state[2]; + uint64_t d = state[3]; + uint64_t e = state[4]; + uint64_t f = state[5]; + uint64_t g = state[6]; + uint64_t h = state[7]; + + uint64_t w[16]; + + ROUND( 0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22); + ROUND( 1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd); + ROUND( 2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f); + ROUND( 3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc); + ROUND( 4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538); + ROUND( 5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019); + ROUND( 6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b); + ROUND( 7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118); + ROUND( 8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242); + ROUND( 9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe); + ROUND(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c); + ROUND(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2); + ROUND(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f); + ROUND(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1); + ROUND(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235); + ROUND(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694); + ROUND(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2); + ROUND(17, h, a, b, c, d, e, f, g, 
0xefbe4786384f25e3); + ROUND(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5); + ROUND(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65); + ROUND(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275); + ROUND(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483); + ROUND(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4); + ROUND(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5); + ROUND(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab); + ROUND(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210); + ROUND(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f); + ROUND(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4); + ROUND(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2); + ROUND(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725); + ROUND(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f); + ROUND(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70); + ROUND(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc); + ROUND(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926); + ROUND(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed); + ROUND(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df); + ROUND(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de); + ROUND(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8); + ROUND(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6); + ROUND(39, b, c, d, e, f, g, h, a, 0x92722c851482353b); + ROUND(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364); + ROUND(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001); + ROUND(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791); + ROUND(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30); + ROUND(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218); + ROUND(45, d, e, f, g, h, a, b, c, 0xd69906245565a910); + ROUND(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a); + ROUND(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8); + ROUND(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8); + ROUND(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53); + ROUND(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99); + ROUND(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8); + ROUND(52, e, f, g, h, a, b, c, d, 
0x391c0cb3c5c95a63); + ROUND(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb); + ROUND(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373); + ROUND(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3); + ROUND(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc); + ROUND(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60); + ROUND(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72); + ROUND(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec); + ROUND(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28); + ROUND(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9); + ROUND(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915); + ROUND(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); + ROUND(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c); + ROUND(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207); + ROUND(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e); + ROUND(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178); + ROUND(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba); + ROUND(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6); + ROUND(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae); + ROUND(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b); + ROUND(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84); + ROUND(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493); + ROUND(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc); + ROUND(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c); + ROUND(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6); + ROUND(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a); + ROUND(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec); + ROUND(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + + block += SHA512_BLOCK_SIZE; + } + while (--count); + + #undef ROUND + #undef W + #undef Ch + #undef Maj + #undef BSig0 + #undef BSig1 + #undef SSig0 + #undef SSig1 +} + +void sha512_init(sha512_ctx* ctx) +{ + ctx->count[0] = 0; + ctx->count[1] = 0; + ctx->state[0] = 0x6a09e667f3bcc908; + 
ctx->state[1] = 0xbb67ae8584caa73b; + ctx->state[2] = 0x3c6ef372fe94f82b; + ctx->state[3] = 0xa54ff53a5f1d36f1; + ctx->state[4] = 0x510e527fade682d1; + ctx->state[5] = 0x9b05688c2b3e6c1f; + ctx->state[6] = 0x1f83d9abfb41bd6b; + ctx->state[7] = 0x5be0cd19137e2179; +} + +void sha512_update(sha512_ctx* ctx, const void* data, size_t size) +{ + const uint8_t* buffer = (const uint8_t*)data; + + size_t pending = ctx->count[0] % SHA512_BLOCK_SIZE; + ctx->count[0] += size; + ctx->count[1] += size > ctx->count[0]; + + size_t available = SHA512_BLOCK_SIZE - pending; + if (pending && size >= available) + { + memcpy(ctx->buffer + pending, buffer, available); + sha512_process(ctx->state, ctx->buffer, 1); + buffer += available; + size -= available; + pending = 0; + } + + size_t count = size / SHA512_BLOCK_SIZE; + if (count) + { + sha512_process(ctx->state, buffer, count); + buffer += count * SHA512_BLOCK_SIZE; + size -= count * SHA512_BLOCK_SIZE; + } + + memcpy(ctx->buffer + pending, buffer, size); +} + +void sha512_finish(sha512_ctx* ctx, uint8_t digest[SHA512_DIGEST_SIZE]) +{ + uint64_t count0 = ctx->count[0]; + uint64_t count1 = ctx->count[1]; + uint64_t bitcount[2] = { (count0 << 3), (count1 << 3) | (count0 >> 61) }; + + size_t pending = count0 % SHA512_BLOCK_SIZE; + size_t blocks = pending < SHA512_BLOCK_SIZE - sizeof(bitcount) ? 
1 : 2; + + ctx->buffer[pending++] = 0x80; + + uint8_t padding[2 * SHA512_BLOCK_SIZE]; + memcpy(padding, ctx->buffer, SHA512_BLOCK_SIZE); + memset(padding + pending, 0, SHA512_BLOCK_SIZE); + SHA512_SET64BE(padding + blocks * SHA512_BLOCK_SIZE - 2*sizeof(uint64_t), bitcount[1]); + SHA512_SET64BE(padding + blocks * SHA512_BLOCK_SIZE - 1*sizeof(uint64_t), bitcount[0]); + + sha512_process(ctx->state, padding, blocks); + + for (size_t i=0; i<8; i++) + { + SHA512_SET64BE(digest + i*sizeof(uint64_t), ctx->state[i]); + } +} + +void sha384_init(sha384_ctx* ctx) +{ + ctx->count[0] = 0; + ctx->count[1] = 0; + ctx->state[0] = 0xcbbb9d5dc1059ed8; + ctx->state[1] = 0x629a292a367cd507; + ctx->state[2] = 0x9159015a3070dd17; + ctx->state[3] = 0x152fecd8f70e5939; + ctx->state[4] = 0x67332667ffc00b31; + ctx->state[5] = 0x8eb44a8768581511; + ctx->state[6] = 0xdb0c2e0d64f98fa7; + ctx->state[7] = 0x47b5481dbefa4fa4; +} + +void sha384_update(sha512_ctx* ctx, const void* data, size_t size) +{ + sha512_update(ctx, data, size); +} + +void sha384_finish(sha384_ctx* ctx, uint8_t digest[SHA384_DIGEST_SIZE]) +{ + uint8_t temp[SHA512_DIGEST_SIZE]; + sha512_finish(ctx, temp); + + memcpy(digest, temp, SHA384_DIGEST_SIZE); +} + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(_MSC_VER) +# pragma warning (pop) +#endif diff --git a/src/third_party/tomcrypt_hash/tomcrypt_hash.h b/src/third_party/tomcrypt_hash/tomcrypt_hash.h deleted file mode 100644 index 07770f01..00000000 --- a/src/third_party/tomcrypt_hash/tomcrypt_hash.h +++ /dev/null @@ -1,567 +0,0 @@ -// This is a collection of code originally sourced from LibTomCrypt, located at -// https://github.com/libtom/libtomcrypt, released under the following license: -// -// --- -// -// The LibTom license -// -// This is free and unencumbered software released into the public domain. 
-// -// Anyone is free to copy, modify, publish, use, compile, sell, or -// distribute this software, either in source code form or as a compiled -// binary, for any purpose, commercial or non-commercial, and by any -// means. -// -// In jurisdictions that recognize copyright laws, the author or authors -// of this software dedicate any and all copyright interest in the -// software to the public domain. We make this dedication for the benefit -// of the public at large and to the detriment of our heirs and -// successors. We intend this dedication to be an overt act of -// relinquishment in perpetuity of all present and future rights to this -// software under copyright law. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// For more information, please refer to -// -// --- -// -// The code has been narrowed down and slightly modified, to include only the -// things that the RAD Debugger project needs, and to work with the project's -// build structure cleanly. 
- -#ifndef TOMCRYPT_HASH_H -#define TOMCRYPT_HASH_H - -//////////////////////////////// -//~ rjf: Common Helpers - -#define CRYPT_OK 1 - -#define LOAD32H(x, y) \ -do { x = ((U32)((y)[0] & 255)<<24) | \ -((U32)((y)[1] & 255)<<16) | \ -((U32)((y)[2] & 255)<<8) | \ -((U32)((y)[3] & 255)); } while(0) - -#define STORE32H(x, y) \ -do { (y)[0] = (unsigned char)(((x)>>24)&255); (y)[1] = (unsigned char)(((x)>>16)&255); \ -(y)[2] = (unsigned char)(((x)>>8)&255); (y)[3] = (unsigned char)((x)&255); } while(0) - -#define STORE64H(x, y) \ -do { (y)[0] = (unsigned char)(((x)>>56)&255); (y)[1] = (unsigned char)(((x)>>48)&255); \ -(y)[2] = (unsigned char)(((x)>>40)&255); (y)[3] = (unsigned char)(((x)>>32)&255); \ -(y)[4] = (unsigned char)(((x)>>24)&255); (y)[5] = (unsigned char)(((x)>>16)&255); \ -(y)[6] = (unsigned char)(((x)>>8)&255); (y)[7] = (unsigned char)((x)&255); } while(0) - -#define LTC_TMPVAR__(n, l) n ## l -#define LTC_TMPVAR_(n, l) LTC_TMPVAR__(n, l) -#define LTC_TMPVAR(n) LTC_TMPVAR_(LTC_ ## n ## _, __LINE__) - -#define ROL(x, y) ( (((U32)(x)<<(U32)((y)&31)) | (((U32)(x)&0xFFFFFFFFUL)>>(U32)((32-((y)&31))&31))) & 0xFFFFFFFFUL) -#define ROR(x, y) ( ((((U32)(x)&0xFFFFFFFFUL)>>(U32)((y)&31)) | ((U32)(x)<<(U32)((32-((y)&31))&31))) & 0xFFFFFFFFUL) -#define ROLc(x, y) ( (((U32)(x)<<(U32)((y)&31)) | (((U32)(x)&0xFFFFFFFFUL)>>(U32)((32-((y)&31))&31))) & 0xFFFFFFFFUL) -#define RORc(x, y) ( ((((U32)(x)&0xFFFFFFFFUL)>>(U32)((y)&31)) | ((U32)(x)<<(U32)((32-((y)&31))&31))) & 0xFFFFFFFFUL) - -#define MIN(x, y) ( ((x)<(y))?(x):(y) ) - -//////////////////////////////// -//~ rjf: SHA256 - -typedef struct SHA256State SHA256State; -struct SHA256State -{ - U64 length; - U32 state[8], curlen; - U8 buf[64]; -}; - -/* Various logical functions */ -#define Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S(x, n) RORc((x),(n)) -#define R(x, n) (((x)&0xFFFFFFFFUL)>>(n)) -#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) -#define Sigma1(x) (S(x, 6) ^ S(x, 11) 
^ S(x, 25)) -#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) -#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) - -/* compress 512-bits */ -static int s_sha256_compress(SHA256State *state, const unsigned char *buf) -{ - U32 S[8], W[64], t0, t1; - int i; - - /* copy state into S */ - for (i = 0; i < 8; i++) { - S[i] = state->state[i]; - } - - /* copy the state into 512-bits into W[0..15] */ - for (i = 0; i < 16; i++) { - LOAD32H(W[i], buf + (4*i)); - } - - /* fill W[16..63] */ - for (i = 16; i < 64; i++) { - W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16]; - } - - /* Compress */ -#define RND(a,b,c,d,e,f,g,h,i,ki) \ -t0 = h + Sigma1(e) + Ch(e, f, g) + ki + W[i]; \ -t1 = Sigma0(a) + Maj(a, b, c); \ -d += t0; \ -h = t0 + t1; - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],0,0x428a2f98); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],1,0x71374491); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],2,0xb5c0fbcf); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],3,0xe9b5dba5); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],4,0x3956c25b); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],5,0x59f111f1); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],6,0x923f82a4); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],7,0xab1c5ed5); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],8,0xd807aa98); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],9,0x12835b01); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],10,0x243185be); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],11,0x550c7dc3); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],12,0x72be5d74); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],13,0x80deb1fe); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],14,0x9bdc06a7); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],15,0xc19bf174); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],16,0xe49b69c1); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],17,0xefbe4786); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],18,0x0fc19dc6); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],19,0x240ca1cc); - 
RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],20,0x2de92c6f); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],21,0x4a7484aa); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],22,0x5cb0a9dc); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],23,0x76f988da); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],24,0x983e5152); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],25,0xa831c66d); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],26,0xb00327c8); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],27,0xbf597fc7); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],28,0xc6e00bf3); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],29,0xd5a79147); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],30,0x06ca6351); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],31,0x14292967); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],32,0x27b70a85); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],33,0x2e1b2138); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],34,0x4d2c6dfc); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],35,0x53380d13); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],36,0x650a7354); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],37,0x766a0abb); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],38,0x81c2c92e); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],39,0x92722c85); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],40,0xa2bfe8a1); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],41,0xa81a664b); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],42,0xc24b8b70); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],43,0xc76c51a3); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],44,0xd192e819); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],45,0xd6990624); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],46,0xf40e3585); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],47,0x106aa070); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],48,0x19a4c116); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],49,0x1e376c08); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],50,0x2748774c); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],51,0x34b0bcb5); - 
RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],52,0x391c0cb3); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],53,0x4ed8aa4a); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],54,0x5b9cca4f); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],55,0x682e6ff3); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],56,0x748f82ee); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],57,0x78a5636f); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],58,0x84c87814); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],59,0x8cc70208); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],60,0x90befffa); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],61,0xa4506ceb); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],62,0xbef9a3f7); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],63,0xc67178f2); -#undef RND - - /* feedback */ - for (i = 0; i < 8; i++) { - state->state[i] = state->state[i] + S[i]; - } - return CRYPT_OK; -} - -/** - Initialize the hash state - @param md The hash state you wish to initialize - @return CRYPT_OK if successful -*/ -int sha256_init(SHA256State *state) -{ - state->curlen = 0; - state->length = 0; - state->state[0] = 0x6A09E667UL; - state->state[1] = 0xBB67AE85UL; - state->state[2] = 0x3C6EF372UL; - state->state[3] = 0xA54FF53AUL; - state->state[4] = 0x510E527FUL; - state->state[5] = 0x9B05688CUL; - state->state[6] = 0x1F83D9ABUL; - state->state[7] = 0x5BE0CD19UL; - return CRYPT_OK; -} - -/** - Process a block of memory though the hash - @param md The hash state - @param in The data to hash - @param inlen The length of the data (octets) - @return CRYPT_OK if successful -*/ - -int sha256_process(SHA256State *state, const unsigned char *in, unsigned long inlen) -{ - unsigned long n; - int err; - int block_size = 64; - if(state->curlen > sizeof(state->buf)) - { - return 0; // CRYPT_INVALID_ARG - } - if(((state->length + inlen * 8) < state->length) || ((inlen * 8) < inlen)) - { - return 0; // CRYPT_HASH_OVERFLOW - } - while(inlen > 0) - { - if(state->curlen == 0 && inlen >= block_size) - { - if ((err = 
s_sha256_compress(state, in)) != CRYPT_OK) - { - return err; - } - state->length += block_size * 8; - in += block_size; - inlen -= block_size; - } else { - n = MIN(inlen, (block_size - state->curlen)); - MemoryCopy(state->buf + state->curlen, in, (size_t)n); - state->curlen += n; - in += n; - inlen -= n; - if(state->curlen == block_size) - { - if((err = s_sha256_compress(state, state->buf)) != CRYPT_OK) - { - return err; - } - state->length += 8*block_size; - state->curlen = 0; - } - } - } - return CRYPT_OK; -} - -/** - Terminate the hash to get the digest - @param md The hash state - @param out [out] The destination of the hash (32 bytes) - @return CRYPT_OK if successful -*/ -int sha256_done(SHA256State *state, unsigned char *out) -{ - int i; - - if (state->curlen >= sizeof(state->buf)) { - return 0; // CRYPT_INVALID_ARG - } - - - /* increase the length of the message */ - state->length += state->curlen * 8; - - /* append the '1' bit */ - state->buf[state->curlen++] = (unsigned char)0x80; - - /* if the length is currently above 56 bytes we append zeros - * then compress. Then we can fall back to padding zeros and length - * encoding like normal. 
- */ - if (state->curlen > 56) { - while (state->curlen < 64) { - state->buf[state->curlen++] = (unsigned char)0; - } - s_sha256_compress(state, state->buf); - state->curlen = 0; - } - - /* pad upto 56 bytes of zeroes */ - while (state->curlen < 56) { - state->buf[state->curlen++] = (unsigned char)0; - } - - /* store length */ - STORE64H(state->length, state->buf+56); - s_sha256_compress(state, state->buf); - - /* copy output */ - for (i = 0; i < 8; i++) { - STORE32H(state->state[i], out+(4*i)); - } - return CRYPT_OK; -} - -#undef Ch -#undef Maj -#undef S -#undef R -#undef Sigma0 -#undef Sigma1 -#undef Gamma0 -#undef Gamma1 - -//////////////////////////////// -//~ rjf: SHA1 - -typedef struct SHA1State SHA1State; -struct SHA1State -{ - U64 length; - U32 state[5], curlen; - unsigned char buf[64]; -}; - -#define F0(x,y,z) (z ^ (x & (y ^ z))) -#define F1(x,y,z) (x ^ y ^ z) -#define F2(x,y,z) ((x & y) | (z & (x | y))) -#define F3(x,y,z) (x ^ y ^ z) - -static int s_sha1_compress(SHA1State *state, const unsigned char *buf) -{ - U32 a,b,c,d,e,W[80],i; - - /* copy the state into 512-bits into W[0..15] */ - for (i = 0; i < 16; i++) { - LOAD32H(W[i], buf + (4*i)); - } - - /* copy state */ - a = state->state[0]; - b = state->state[1]; - c = state->state[2]; - d = state->state[3]; - e = state->state[4]; - - /* expand it */ - for (i = 16; i < 80; i++) { - W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1); - } - - /* compress */ - /* round one */ -#define FF0(a,b,c,d,e,i) e = (ROLc(a, 5) + F0(b,c,d) + e + W[i] + 0x5a827999UL); b = ROLc(b, 30); -#define FF1(a,b,c,d,e,i) e = (ROLc(a, 5) + F1(b,c,d) + e + W[i] + 0x6ed9eba1UL); b = ROLc(b, 30); -#define FF2(a,b,c,d,e,i) e = (ROLc(a, 5) + F2(b,c,d) + e + W[i] + 0x8f1bbcdcUL); b = ROLc(b, 30); -#define FF3(a,b,c,d,e,i) e = (ROLc(a, 5) + F3(b,c,d) + e + W[i] + 0xca62c1d6UL); b = ROLc(b, 30); - -#ifdef LTC_SMALL_CODE - - for (i = 0; i < 20; ) { - FF0(a,b,c,d,e,i++); t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 40; ) { - 
FF1(a,b,c,d,e,i++); t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 60; ) { - FF2(a,b,c,d,e,i++); t = e; e = d; d = c; c = b; b = a; a = t; - } - - for (; i < 80; ) { - FF3(a,b,c,d,e,i++); t = e; e = d; d = c; c = b; b = a; a = t; - } - -#else - - for (i = 0; i < 20; ) { - FF0(a,b,c,d,e,i++); - FF0(e,a,b,c,d,i++); - FF0(d,e,a,b,c,i++); - FF0(c,d,e,a,b,i++); - FF0(b,c,d,e,a,i++); - } - - /* round two */ - for (; i < 40; ) { - FF1(a,b,c,d,e,i++); - FF1(e,a,b,c,d,i++); - FF1(d,e,a,b,c,i++); - FF1(c,d,e,a,b,i++); - FF1(b,c,d,e,a,i++); - } - - /* round three */ - for (; i < 60; ) { - FF2(a,b,c,d,e,i++); - FF2(e,a,b,c,d,i++); - FF2(d,e,a,b,c,i++); - FF2(c,d,e,a,b,i++); - FF2(b,c,d,e,a,i++); - } - - /* round four */ - for (; i < 80; ) { - FF3(a,b,c,d,e,i++); - FF3(e,a,b,c,d,i++); - FF3(d,e,a,b,c,i++); - FF3(c,d,e,a,b,i++); - FF3(b,c,d,e,a,i++); - } -#endif - -#undef FF0 -#undef FF1 -#undef FF2 -#undef FF3 - - /* store */ - state->state[0] = state->state[0] + a; - state->state[1] = state->state[1] + b; - state->state[2] = state->state[2] + c; - state->state[3] = state->state[3] + d; - state->state[4] = state->state[4] + e; - - return CRYPT_OK; -} - -/** - Initialize the hash state - @param md The hash state you wish to initialize - @return CRYPT_OK if successful -*/ -int sha1_init(SHA1State *state) -{ - state->state[0] = 0x67452301UL; - state->state[1] = 0xefcdab89UL; - state->state[2] = 0x98badcfeUL; - state->state[3] = 0x10325476UL; - state->state[4] = 0xc3d2e1f0UL; - state->curlen = 0; - state->length = 0; - return CRYPT_OK; -} - -/** - Process a block of memory though the hash - @param md The hash state - @param in The data to hash - @param inlen The length of the data (octets) - @return CRYPT_OK if successful -*/ -// HASH_PROCESS(sha1_process, s_sha1_compress, sha1, 64) -int sha1_process(SHA1State *state, const unsigned char *in, unsigned long inlen) -{ - unsigned long n; - int err; - int block_size = 64; - if(state->curlen > sizeof(state->buf)) - { - 
return 0; // CRYPT_INVALID_ARG - } - if(((state->length + inlen * 8) < state->length) || ((inlen * 8) < inlen)) - { - return 0; // CRYPT_HASH_OVERFLOW - } - while(inlen > 0) - { - if(state->curlen == 0 && inlen >= block_size) - { - if ((err = s_sha1_compress(state, in)) != CRYPT_OK) - { - return err; - } - state->length += block_size * 8; - in += block_size; - inlen -= block_size; - } else { - n = MIN(inlen, (block_size - state->curlen)); - MemoryCopy(state->buf + state->curlen, in, (size_t)n); - state->curlen += n; - in += n; - inlen -= n; - if(state->curlen == block_size) - { - if((err = s_sha1_compress(state, state->buf)) != CRYPT_OK) - { - return err; - } - state->length += 8*block_size; - state->curlen = 0; - } - } - } - return CRYPT_OK; -} - - -/** - Terminate the hash to get the digest - @param md The hash state - @param out [out] The destination of the hash (20 bytes) - @return CRYPT_OK if successful -*/ -int sha1_done(SHA1State *state, unsigned char *out) -{ - int i; - - if (state->curlen >= sizeof(state->buf)) { - return 0; // CRYPT_INVALID_ARG; - } - - /* increase the length of the message */ - state->length += state->curlen * 8; - - /* append the '1' bit */ - state->buf[state->curlen++] = (unsigned char)0x80; - - /* if the length is currently above 56 bytes we append zeros - * then compress. Then we can fall back to padding zeros and length - * encoding like normal. 
- */ - if (state->curlen > 56) { - while (state->curlen < 64) { - state->buf[state->curlen++] = (unsigned char)0; - } - s_sha1_compress(state, state->buf); - state->curlen = 0; - } - - /* pad upto 56 bytes of zeroes */ - while (state->curlen < 56) { - state->buf[state->curlen++] = (unsigned char)0; - } - - /* store length */ - STORE64H(state->length, state->buf+56); - s_sha1_compress(state, state->buf); - - /* copy output */ - for (i = 0; i < 5; i++) { - STORE32H(state->state[i], out+(4*i)); - } - return CRYPT_OK; -} - -#undef F0 -#undef F1 -#undef F2 -#undef F3 -#undef FF0 -#undef FF1 -#undef FF2 -#undef FF3 - -#endif // TOMCRYPT_HASH_H